# Import the Libraries
import pandas as pd
pd.options.display.float_format = '{:.2f}'.format
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams.update({'figure.figsize': (12.0, 8.0)})
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')
This dataset's objective is predicting whether an applicant's credit card approval will be approved or not.
The dataset contains applicants' basic information and applicant's credit history.
There are 438557 rows in application.csv. ID is from 5008804 to 6842885.
In credit_record.csv, there are 1048575 rows of 45985 ID's credit record. ID is from 5001711 to 5150487.
Data Dictionary:-
# Check the directory
os.getcwd()
'C:\\Users\\manee\\Data Science\\IPBA\\BYOP\\Final'
# Change the directory of the project
os.chdir('D:\\D - Drive\\IPBA\\BYOP\\Capstone Project\\Final - Credit Card Approval Model')
# Re-check the directory and confirm the changes
os.getcwd()
'D:\\D - Drive\\IPBA\\BYOP\\Capstone Project\\Final - Credit Card Approval Model'
# Read the dataset application_record
app = pd.read_csv('application_record.csv')
app.head()
| ID | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | DAYS_BIRTH | DAYS_EMPLOYED | FLAG_MOBIL | FLAG_WORK_PHONE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008804 | M | Y | Y | 0 | 427500.00 | Working | Higher education | Civil marriage | Rented apartment | -12005 | -4542 | 1 | 1 | 0 | 0 | NaN | 2.00 |
| 1 | 5008805 | M | Y | Y | 0 | 427500.00 | Working | Higher education | Civil marriage | Rented apartment | -12005 | -4542 | 1 | 1 | 0 | 0 | NaN | 2.00 |
| 2 | 5008806 | M | Y | Y | 0 | 112500.00 | Working | Secondary / secondary special | Married | House / apartment | -21474 | -1134 | 1 | 0 | 0 | 0 | Security staff | 2.00 |
| 3 | 5008808 | F | N | Y | 0 | 270000.00 | Commercial associate | Secondary / secondary special | Single / not married | House / apartment | -19110 | -3051 | 1 | 0 | 1 | 1 | Sales staff | 1.00 |
| 4 | 5008809 | F | N | Y | 0 | 270000.00 | Commercial associate | Secondary / secondary special | Single / not married | House / apartment | -19110 | -3051 | 1 | 0 | 1 | 1 | Sales staff | 1.00 |
# Check the last few records of the dataset
app.tail()
| ID | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | DAYS_BIRTH | DAYS_EMPLOYED | FLAG_MOBIL | FLAG_WORK_PHONE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 438552 | 6840104 | M | N | Y | 0 | 135000.00 | Pensioner | Secondary / secondary special | Separated | House / apartment | -22717 | 365243 | 1 | 0 | 0 | 0 | NaN | 1.00 |
| 438553 | 6840222 | F | N | N | 0 | 103500.00 | Working | Secondary / secondary special | Single / not married | House / apartment | -15939 | -3007 | 1 | 0 | 0 | 0 | Laborers | 1.00 |
| 438554 | 6841878 | F | N | N | 0 | 54000.00 | Commercial associate | Higher education | Single / not married | With parents | -8169 | -372 | 1 | 1 | 0 | 0 | Sales staff | 1.00 |
| 438555 | 6842765 | F | N | Y | 0 | 72000.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -21673 | 365243 | 1 | 0 | 0 | 0 | NaN | 2.00 |
| 438556 | 6842885 | F | N | Y | 0 | 121500.00 | Working | Secondary / secondary special | Married | House / apartment | -18858 | -1201 | 1 | 0 | 1 | 0 | Sales staff | 2.00 |
# Column names: convert to lower case
app = app.rename(columns = str.lower)
app.head()
| id | code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_mobil | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008804 | M | Y | Y | 0 | 427500.00 | Working | Higher education | Civil marriage | Rented apartment | -12005 | -4542 | 1 | 1 | 0 | 0 | NaN | 2.00 |
| 1 | 5008805 | M | Y | Y | 0 | 427500.00 | Working | Higher education | Civil marriage | Rented apartment | -12005 | -4542 | 1 | 1 | 0 | 0 | NaN | 2.00 |
| 2 | 5008806 | M | Y | Y | 0 | 112500.00 | Working | Secondary / secondary special | Married | House / apartment | -21474 | -1134 | 1 | 0 | 0 | 0 | Security staff | 2.00 |
| 3 | 5008808 | F | N | Y | 0 | 270000.00 | Commercial associate | Secondary / secondary special | Single / not married | House / apartment | -19110 | -3051 | 1 | 0 | 1 | 1 | Sales staff | 1.00 |
| 4 | 5008809 | F | N | Y | 0 | 270000.00 | Commercial associate | Secondary / secondary special | Single / not married | House / apartment | -19110 | -3051 | 1 | 0 | 1 | 1 | Sales staff | 1.00 |
# Check the dimensions of the dataset
app.shape
(438557, 18)
# Check the datatypes of the dataset
app.dtypes
id int64 code_gender object flag_own_car object flag_own_realty object cnt_children int64 amt_income_total float64 name_income_type object name_education_type object name_family_status object name_housing_type object days_birth int64 days_employed int64 flag_mobil int64 flag_work_phone int64 flag_phone int64 flag_email int64 occupation_type object cnt_fam_members float64 dtype: object
# Print information about the dataset
app.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 438557 entries, 0 to 438556 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 438557 non-null int64 1 code_gender 438557 non-null object 2 flag_own_car 438557 non-null object 3 flag_own_realty 438557 non-null object 4 cnt_children 438557 non-null int64 5 amt_income_total 438557 non-null float64 6 name_income_type 438557 non-null object 7 name_education_type 438557 non-null object 8 name_family_status 438557 non-null object 9 name_housing_type 438557 non-null object 10 days_birth 438557 non-null int64 11 days_employed 438557 non-null int64 12 flag_mobil 438557 non-null int64 13 flag_work_phone 438557 non-null int64 14 flag_phone 438557 non-null int64 15 flag_email 438557 non-null int64 16 occupation_type 304354 non-null object 17 cnt_fam_members 438557 non-null float64 dtypes: float64(2), int64(8), object(8) memory usage: 60.2+ MB
# Generate statistical summary of the continuous variables of the dataset and transpose it
app.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| id | 438557.00 | 6022176.27 | 571637.02 | 5008804.00 | 5609375.00 | 6047745.00 | 6456971.00 | 7999952.00 |
| cnt_children | 438557.00 | 0.43 | 0.72 | 0.00 | 0.00 | 0.00 | 1.00 | 19.00 |
| amt_income_total | 438557.00 | 187524.29 | 110086.85 | 26100.00 | 121500.00 | 160780.50 | 225000.00 | 6750000.00 |
| days_birth | 438557.00 | -15997.90 | 4185.03 | -25201.00 | -19483.00 | -15630.00 | -12514.00 | -7489.00 |
| days_employed | 438557.00 | 60563.68 | 138767.80 | -17531.00 | -3103.00 | -1467.00 | -371.00 | 365243.00 |
| flag_mobil | 438557.00 | 1.00 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 |
| flag_work_phone | 438557.00 | 0.21 | 0.40 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| flag_phone | 438557.00 | 0.29 | 0.45 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 |
| flag_email | 438557.00 | 0.11 | 0.31 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| cnt_fam_members | 438557.00 | 2.19 | 0.90 | 1.00 | 2.00 | 2.00 | 3.00 | 20.00 |
# Generate statistical summary of the continuous and discrete variables of the dataset and transpose it
app.describe(include = 'all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| id | 438557.00 | NaN | NaN | NaN | 6022176.27 | 571637.02 | 5008804.00 | 5609375.00 | 6047745.00 | 6456971.00 | 7999952.00 |
| code_gender | 438557 | 2 | F | 294440 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| flag_own_car | 438557 | 2 | N | 275459 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| flag_own_realty | 438557 | 2 | Y | 304074 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| cnt_children | 438557.00 | NaN | NaN | NaN | 0.43 | 0.72 | 0.00 | 0.00 | 0.00 | 1.00 | 19.00 |
| amt_income_total | 438557.00 | NaN | NaN | NaN | 187524.29 | 110086.85 | 26100.00 | 121500.00 | 160780.50 | 225000.00 | 6750000.00 |
| name_income_type | 438557 | 5 | Working | 226104 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| name_education_type | 438557 | 5 | Secondary / secondary special | 301821 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| name_family_status | 438557 | 5 | Married | 299828 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| name_housing_type | 438557 | 6 | House / apartment | 393831 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| days_birth | 438557.00 | NaN | NaN | NaN | -15997.90 | 4185.03 | -25201.00 | -19483.00 | -15630.00 | -12514.00 | -7489.00 |
| days_employed | 438557.00 | NaN | NaN | NaN | 60563.68 | 138767.80 | -17531.00 | -3103.00 | -1467.00 | -371.00 | 365243.00 |
| flag_mobil | 438557.00 | NaN | NaN | NaN | 1.00 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 |
| flag_work_phone | 438557.00 | NaN | NaN | NaN | 0.21 | 0.40 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| flag_phone | 438557.00 | NaN | NaN | NaN | 0.29 | 0.45 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 |
| flag_email | 438557.00 | NaN | NaN | NaN | 0.11 | 0.31 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| occupation_type | 304354 | 18 | Laborers | 78240 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| cnt_fam_members | 438557.00 | NaN | NaN | NaN | 2.19 | 0.90 | 1.00 | 2.00 | 2.00 | 3.00 | 20.00 |
# Check the total count of unique values of all the variables in the dataset
app.nunique()
id 438510 code_gender 2 flag_own_car 2 flag_own_realty 2 cnt_children 12 amt_income_total 866 name_income_type 5 name_education_type 5 name_family_status 5 name_housing_type 6 days_birth 16379 days_employed 9406 flag_mobil 1 flag_work_phone 2 flag_phone 2 flag_email 2 occupation_type 18 cnt_fam_members 13 dtype: int64
# Find the missing values of all the variables in the dataset
app.isnull().sum().sort_values(ascending = False)
occupation_type 134203 id 0 code_gender 0 flag_email 0 flag_phone 0 flag_work_phone 0 flag_mobil 0 days_employed 0 days_birth 0 name_housing_type 0 name_family_status 0 name_education_type 0 name_income_type 0 amt_income_total 0 cnt_children 0 flag_own_realty 0 flag_own_car 0 cnt_fam_members 0 dtype: int64
(app.isnull().sum() / len(app) * 100).sort_values(ascending = False)
occupation_type 30.60 id 0.00 code_gender 0.00 flag_email 0.00 flag_phone 0.00 flag_work_phone 0.00 flag_mobil 0.00 days_employed 0.00 days_birth 0.00 name_housing_type 0.00 name_family_status 0.00 name_education_type 0.00 name_income_type 0.00 amt_income_total 0.00 cnt_children 0.00 flag_own_realty 0.00 flag_own_car 0.00 cnt_fam_members 0.00 dtype: float64
Insights:-
NOTE:-
# Analysing variables containing null values
# Threshold: 30%
null_var = app.isnull().sum().sort_values(ascending = False)
null_var
occupation_type 134203 id 0 code_gender 0 flag_email 0 flag_phone 0 flag_work_phone 0 flag_mobil 0 days_employed 0 days_birth 0 name_housing_type 0 name_family_status 0 name_education_type 0 name_income_type 0 amt_income_total 0 cnt_children 0 flag_own_realty 0 flag_own_car 0 cnt_fam_members 0 dtype: int64
null_var30 = null_var[null_var.values > (0.30 * len(app))]
null_var30
occupation_type 134203 dtype: int64
Insights:-
# Plotting Bar Graph for null values greater than 30%
plt.figure(figsize = (5, 5))
null_var30.plot(kind = 'bar', color = "#4CB391")
plt.title('List of Columns & null counts where null values are more than 30%')
plt.xlabel("Null Columns", fontdict = {"fontsize": 12, "fontweight": 5}) #Setting X-label and Y-label
plt.ylabel("Count of null values", fontdict = {"fontsize": 12, "fontweight": 5})
plt.show()
# Analysing variables containing null values
# Threshold: 35%
null_var35 = null_var[null_var.values > (0.35 * len(app))]
null_var35
Series([], dtype: int64)
Insights:-
# Check the total number of columns having null values greater than 30%
len(null_var30)
1
# List the column name having null values greater than 30%
col_names = list(null_var30.index.values)
col_names
# app.drop(labels = col_names, axis = 1, inplace = True) #Droping those columns
['occupation_type']
Insights:-
# Check the total number of columns having null values greater than 30%
len(null_var35)
0
Insights:-
app.shape
(438557, 18)
# After removing null values, check the percentage of null values for each column again.
null = (app.isnull().sum() / len(app) * 100).sort_values(ascending = False)
null
occupation_type 30.60 id 0.00 code_gender 0.00 flag_email 0.00 flag_phone 0.00 flag_work_phone 0.00 flag_mobil 0.00 days_employed 0.00 days_birth 0.00 name_housing_type 0.00 name_family_status 0.00 name_education_type 0.00 name_income_type 0.00 amt_income_total 0.00 cnt_children 0.00 flag_own_realty 0.00 flag_own_car 0.00 cnt_fam_members 0.00 dtype: float64
Insights:-
# Check sample of duplicate records by combining DAYS_EMPLOYED and DAYS_BIRTH
app.loc[app.days_employed == -1194].loc[app.days_birth == -17778]
| id | code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_mobil | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 13 | 5008819 | M | Y | Y | 0 | 135000.00 | Commercial associate | Secondary / secondary special | Married | House / apartment | -17778 | -1194 | 1 | 0 | 0 | 0 | Laborers | 2.00 |
| 14 | 5008820 | M | Y | Y | 0 | 135000.00 | Commercial associate | Secondary / secondary special | Married | House / apartment | -17778 | -1194 | 1 | 0 | 0 | 0 | Laborers | 2.00 |
| 15 | 5008821 | M | Y | Y | 0 | 135000.00 | Commercial associate | Secondary / secondary special | Married | House / apartment | -17778 | -1194 | 1 | 0 | 0 | 0 | Laborers | 2.00 |
| 16 | 5008822 | M | Y | Y | 0 | 135000.00 | Commercial associate | Secondary / secondary special | Married | House / apartment | -17778 | -1194 | 1 | 0 | 0 | 0 | Laborers | 2.00 |
| 17 | 5008823 | M | Y | Y | 0 | 135000.00 | Commercial associate | Secondary / secondary special | Married | House / apartment | -17778 | -1194 | 1 | 0 | 0 | 0 | Laborers | 2.00 |
| 18 | 5008824 | M | Y | Y | 0 | 135000.00 | Commercial associate | Secondary / secondary special | Married | House / apartment | -17778 | -1194 | 1 | 0 | 0 | 0 | Laborers | 2.00 |
# dropping duplicate values
# app = app.drop_duplicates(subset = app.columns[1:], keep = 'first', inplace = False)
# app.head()
# app.shape
This is a csv file with credit record for a part of ID in application record. We can treat it a file to generate labels for modeling. For the applicants who have a record more than 59 past due, they should be rejected.
Data Dictionary:-
X: No loan for the month
C: paid off that month
0: 1-29 days past due
1: 30-59 days past due
2: 60-89 days overdue
3: 90-119 days overdue
4: 120-149 days overdue
5: Overdue or bad debts, write-offs for more than 150 days
# Read the dataset credit_record
cred = pd.read_csv('credit_record.csv')
cred.head()
| ID | MONTHS_BALANCE | STATUS | |
|---|---|---|---|
| 0 | 5001711 | 0 | X |
| 1 | 5001711 | -1 | 0 |
| 2 | 5001711 | -2 | 0 |
| 3 | 5001711 | -3 | 0 |
| 4 | 5001712 | 0 | C |
# View the last few records of the dataset
cred.tail()
| ID | MONTHS_BALANCE | STATUS | |
|---|---|---|---|
| 1048570 | 5150487 | -25 | C |
| 1048571 | 5150487 | -26 | C |
| 1048572 | 5150487 | -27 | C |
| 1048573 | 5150487 | -28 | C |
| 1048574 | 5150487 | -29 | C |
# Column names: convert to lower case
cred = cred.rename(columns = str.lower)
cred.head()
| id | months_balance | status | |
|---|---|---|---|
| 0 | 5001711 | 0 | X |
| 1 | 5001711 | -1 | 0 |
| 2 | 5001711 | -2 | 0 |
| 3 | 5001711 | -3 | 0 |
| 4 | 5001712 | 0 | C |
# Check the dimensions of the dataset
cred.shape
(1048575, 3)
# Check the datatypes of the dataset
cred.dtypes
id int64 months_balance int64 status object dtype: object
# Print information about the dataset
cred.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1048575 entries, 0 to 1048574 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 1048575 non-null int64 1 months_balance 1048575 non-null int64 2 status 1048575 non-null object dtypes: int64(2), object(1) memory usage: 24.0+ MB
# Generate statistical summary of the continuous variables of the dataset and transpose it
cred.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| id | 1048575.00 | 5068286.42 | 46150.58 | 5001711.00 | 5023644.00 | 5062104.00 | 5113856.00 | 5150487.00 |
| months_balance | 1048575.00 | -19.14 | 14.02 | -60.00 | -29.00 | -17.00 | -7.00 | 0.00 |
# Generate statistical summary of the continuous and discrete variables of the dataset and transpose it
cred.describe(include = 'all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| id | 1048575.00 | NaN | NaN | NaN | 5068286.42 | 46150.58 | 5001711.00 | 5023644.00 | 5062104.00 | 5113856.00 | 5150487.00 |
| months_balance | 1048575.00 | NaN | NaN | NaN | -19.14 | 14.02 | -60.00 | -29.00 | -17.00 | -7.00 | 0.00 |
| status | 1048575 | 8 | C | 442031 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Find the total count of unique records
cred.nunique()
id 45985 months_balance 61 status 8 dtype: int64
# Find the total number of missing values in every variable
cred.isnull().sum()
id 0 months_balance 0 status 0 dtype: int64
# Replace X, and C by 0 of the status variable
# We will be considering '0' as Good Customer for our analysis and '1' as Bad Customer (in the status variable)
cred.status.replace('X', 0, inplace = True)
cred.status.replace('C', 0, inplace = True)
cred.head()
| id | months_balance | status | |
|---|---|---|---|
| 0 | 5001711 | 0 | 0 |
| 1 | 5001711 | -1 | 0 |
| 2 | 5001711 | -2 | 0 |
| 3 | 5001711 | -3 | 0 |
| 4 | 5001712 | 0 | 0 |
# Change the datatype of status variable to 'int'
cred.status = cred.status.astype('int')
# Re-check the datatypes of cred dataset
cred.dtypes
id int64 months_balance int64 status int32 dtype: object
# Verify the records on the basis of the status value
cred.loc[cred.status == 3]
| id | months_balance | status | |
|---|---|---|---|
| 8912 | 5002126 | -47 | 3 |
| 8913 | 5002126 | -48 | 3 |
| 34911 | 5003267 | -50 | 3 |
| 34912 | 5003267 | -51 | 3 |
| 44246 | 5003712 | -23 | 3 |
| ... | ... | ... | ... |
| 1020098 | 5148602 | -5 | 3 |
| 1025698 | 5148932 | -46 | 3 |
| 1025743 | 5148934 | -13 | 3 |
| 1040956 | 5149834 | -13 | 3 |
| 1041046 | 5149838 | -22 | 3 |
320 rows × 3 columns
# Retrieve the frequency of the status variable
cred.status.value_counts()
0 1034381 1 11090 5 1693 2 868 3 320 4 223 Name: status, dtype: int64
# Group the credit_recond dataset according to the 'id' variable to remove the similar 'id' records
# Also we will take the worst credit record of an applicant (i.e) the maximum value of the status variable against the
# applicant's id.
# In this filteration the other values of the staus will be discarded and only the highest values of the status will be
# considered.
cred = cred.groupby('id').status.max()
# View the first few records
cred.head(10)
id 5001711 0 5001712 0 5001713 0 5001714 0 5001715 0 5001717 0 5001718 1 5001719 0 5001720 1 5001723 0 Name: status, dtype: int32
# View the last few records
cred.tail()
id 5150482 0 5150483 0 5150484 0 5150485 0 5150487 0 Name: status, dtype: int32
# View the entire records
cred
id
5001711 0
5001712 0
5001713 0
5001714 0
5001715 0
..
5150482 0
5150483 0
5150484 0
5150485 0
5150487 0
Name: status, Length: 45985, dtype: int32
# Retrieve the frequency of the status variable after removing the duplicate ids.
cred.value_counts()
0 40635 1 4683 2 336 5 195 3 88 4 48 Name: status, dtype: int64
# Merge the two datasets : app and cred
df = pd.merge(app, cred, how = 'inner', on = ['id'])
# View the first few records
df.head(10)
| id | code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_mobil | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008804 | M | Y | Y | 0 | 427500.00 | Working | Higher education | Civil marriage | Rented apartment | -12005 | -4542 | 1 | 1 | 0 | 0 | NaN | 2.00 | 1 |
| 1 | 5008805 | M | Y | Y | 0 | 427500.00 | Working | Higher education | Civil marriage | Rented apartment | -12005 | -4542 | 1 | 1 | 0 | 0 | NaN | 2.00 | 1 |
| 2 | 5008806 | M | Y | Y | 0 | 112500.00 | Working | Secondary / secondary special | Married | House / apartment | -21474 | -1134 | 1 | 0 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 5008808 | F | N | Y | 0 | 270000.00 | Commercial associate | Secondary / secondary special | Single / not married | House / apartment | -19110 | -3051 | 1 | 0 | 1 | 1 | Sales staff | 1.00 | 0 |
| 4 | 5008809 | F | N | Y | 0 | 270000.00 | Commercial associate | Secondary / secondary special | Single / not married | House / apartment | -19110 | -3051 | 1 | 0 | 1 | 1 | Sales staff | 1.00 | 0 |
| 5 | 5008810 | F | N | Y | 0 | 270000.00 | Commercial associate | Secondary / secondary special | Single / not married | House / apartment | -19110 | -3051 | 1 | 0 | 1 | 1 | Sales staff | 1.00 | 0 |
| 6 | 5008811 | F | N | Y | 0 | 270000.00 | Commercial associate | Secondary / secondary special | Single / not married | House / apartment | -19110 | -3051 | 1 | 0 | 1 | 1 | Sales staff | 1.00 | 0 |
| 7 | 5008812 | F | N | Y | 0 | 283500.00 | Pensioner | Higher education | Separated | House / apartment | -22464 | 365243 | 1 | 0 | 0 | 0 | NaN | 1.00 | 0 |
| 8 | 5008813 | F | N | Y | 0 | 283500.00 | Pensioner | Higher education | Separated | House / apartment | -22464 | 365243 | 1 | 0 | 0 | 0 | NaN | 1.00 | 0 |
| 9 | 5008814 | F | N | Y | 0 | 283500.00 | Pensioner | Higher education | Separated | House / apartment | -22464 | 365243 | 1 | 0 | 0 | 0 | NaN | 1.00 | 0 |
# Retrieve the frequency of the status variable after merging the datasets.
df.status.value_counts()
0 32166 1 3675 2 314 5 180 3 76 4 46 Name: status, dtype: int64
# Verify any random id of an applicant to confirm if the highest/worst values of his status is chosen and other duplicate ids
# are removed.
df.loc[df.id == 5137203]
| id | code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_mobil | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 32520 | 5137203 | M | N | N | 0 | 90000.00 | Working | Secondary / secondary special | Single / not married | With parents | -9900 | -170 | 1 | 1 | 0 | 0 | Drivers | 1.00 | 4 |
# Check the dimension of the merged dataset
df.shape
(36457, 19)
# Need to remove other values of status variable and required to keep on '0's and '1's.
# Any value in the status variable that is equal to 2 and above will be converted to '1's and below 2 will be converted to '0's.
# '0's means - Good Customers (including customers that are 0-29 days past due date)
# '1's means - Bad Customers
df.status = df.status.apply(lambda x: 1 if x >= 1 else 0)
df.head(10)
| id | code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_mobil | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5008804 | M | Y | Y | 0 | 427500.00 | Working | Higher education | Civil marriage | Rented apartment | -12005 | -4542 | 1 | 1 | 0 | 0 | NaN | 2.00 | 1 |
| 1 | 5008805 | M | Y | Y | 0 | 427500.00 | Working | Higher education | Civil marriage | Rented apartment | -12005 | -4542 | 1 | 1 | 0 | 0 | NaN | 2.00 | 1 |
| 2 | 5008806 | M | Y | Y | 0 | 112500.00 | Working | Secondary / secondary special | Married | House / apartment | -21474 | -1134 | 1 | 0 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 5008808 | F | N | Y | 0 | 270000.00 | Commercial associate | Secondary / secondary special | Single / not married | House / apartment | -19110 | -3051 | 1 | 0 | 1 | 1 | Sales staff | 1.00 | 0 |
| 4 | 5008809 | F | N | Y | 0 | 270000.00 | Commercial associate | Secondary / secondary special | Single / not married | House / apartment | -19110 | -3051 | 1 | 0 | 1 | 1 | Sales staff | 1.00 | 0 |
| 5 | 5008810 | F | N | Y | 0 | 270000.00 | Commercial associate | Secondary / secondary special | Single / not married | House / apartment | -19110 | -3051 | 1 | 0 | 1 | 1 | Sales staff | 1.00 | 0 |
| 6 | 5008811 | F | N | Y | 0 | 270000.00 | Commercial associate | Secondary / secondary special | Single / not married | House / apartment | -19110 | -3051 | 1 | 0 | 1 | 1 | Sales staff | 1.00 | 0 |
| 7 | 5008812 | F | N | Y | 0 | 283500.00 | Pensioner | Higher education | Separated | House / apartment | -22464 | 365243 | 1 | 0 | 0 | 0 | NaN | 1.00 | 0 |
| 8 | 5008813 | F | N | Y | 0 | 283500.00 | Pensioner | Higher education | Separated | House / apartment | -22464 | 365243 | 1 | 0 | 0 | 0 | NaN | 1.00 | 0 |
| 9 | 5008814 | F | N | Y | 0 | 283500.00 | Pensioner | Higher education | Separated | House / apartment | -22464 | 365243 | 1 | 0 | 0 | 0 | NaN | 1.00 | 0 |
# View last few records
df.tail(10)
| id | code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_mobil | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 36447 | 5149145 | M | Y | Y | 0 | 247500.00 | Working | Secondary / secondary special | Married | House / apartment | -10952 | -3577 | 1 | 1 | 0 | 0 | Laborers | 2.00 | 1 |
| 36448 | 5149158 | M | Y | Y | 0 | 247500.00 | Working | Secondary / secondary special | Married | House / apartment | -10952 | -3577 | 1 | 1 | 0 | 0 | Laborers | 2.00 | 1 |
| 36449 | 5149190 | M | Y | N | 1 | 450000.00 | Working | Higher education | Married | House / apartment | -9847 | -502 | 1 | 0 | 1 | 1 | Core staff | 3.00 | 1 |
| 36450 | 5149729 | M | Y | Y | 0 | 90000.00 | Working | Secondary / secondary special | Married | House / apartment | -19101 | -1721 | 1 | 0 | 0 | 0 | NaN | 2.00 | 1 |
| 36451 | 5149775 | F | Y | Y | 0 | 130500.00 | Working | Secondary / secondary special | Married | House / apartment | -16137 | -9391 | 1 | 0 | 1 | 0 | Laborers | 2.00 | 1 |
| 36452 | 5149828 | M | Y | Y | 0 | 315000.00 | Working | Secondary / secondary special | Married | House / apartment | -17348 | -2420 | 1 | 0 | 0 | 0 | Managers | 2.00 | 1 |
| 36453 | 5149834 | F | N | Y | 0 | 157500.00 | Commercial associate | Higher education | Married | House / apartment | -12387 | -1325 | 1 | 0 | 1 | 1 | Medicine staff | 2.00 | 1 |
| 36454 | 5149838 | F | N | Y | 0 | 157500.00 | Pensioner | Higher education | Married | House / apartment | -12387 | -1325 | 1 | 0 | 1 | 1 | Medicine staff | 2.00 | 1 |
| 36455 | 5150049 | F | N | Y | 0 | 283500.00 | Working | Secondary / secondary special | Married | House / apartment | -17958 | -655 | 1 | 0 | 0 | 0 | Sales staff | 2.00 | 1 |
| 36456 | 5150337 | M | N | Y | 0 | 112500.00 | Working | Secondary / secondary special | Single / not married | Rented apartment | -9188 | -1193 | 1 | 0 | 0 | 0 | Laborers | 1.00 | 1 |
# Check the dimensions
df.shape
(36457, 19)
# Verify the same id of an applicant (the one we checked earlier above) to confirm if the status values of 2 and above are
# converted to '1's or not.
# In this example id = 5137203 earlier had the status value of 4.
# But after applying the '1's and '0's functions, we see that it has converted to '1'.
df.loc[df.id == 5137203]
| id | code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_mobil | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 32520 | 5137203 | M | N | N | 0 | 90000.00 | Working | Secondary / secondary special | Single / not married | With parents | -9900 | -170 | 1 | 1 | 0 | 0 | Drivers | 1.00 | 1 |
# Retrieve the frequency of the status variable after converting to '1's and '0's.
# 1 means Rejected applicants
# 0 means Accepted applicants
df.status.value_counts()
0 32166 1 4291 Name: status, dtype: int64
4291 applicants are rejected.
# Sort the dataset according to the 'amt_income_total' variable.
df = df.sort_values('amt_income_total')
df.head()
| id | code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_mobil | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 30846 | 5126175 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 30845 | 5126174 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 34066 | 5143327 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 32169 | 5135923 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 1 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 32170 | 5135925 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 1 | 0 | 0 | 0 | NaN | 2.00 | 0 |
# Reset the index
df = df.reset_index(drop = True)
df.head()
| id | code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_mobil | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5126175 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | 5126174 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | 5143327 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 5135923 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 1 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | 5135925 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 1 | 0 | 0 | 0 | NaN | 2.00 | 0 |
# View the last few records
df.tail()
| id | code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_mobil | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 36452 | 5143234 | F | Y | Y | 1 | 1575000.00 | Commercial associate | Higher education | Single / not married | House / apartment | -10142 | -2479 | 1 | 0 | 0 | 0 | Managers | 2.00 | 0 |
| 36453 | 5143235 | F | Y | Y | 1 | 1575000.00 | Commercial associate | Higher education | Single / not married | House / apartment | -10142 | -2479 | 1 | 0 | 0 | 0 | Managers | 2.00 | 0 |
| 36454 | 5143236 | F | Y | Y | 1 | 1575000.00 | Commercial associate | Higher education | Single / not married | House / apartment | -10142 | -2479 | 1 | 0 | 0 | 0 | Managers | 2.00 | 0 |
| 36455 | 5143237 | F | Y | Y | 1 | 1575000.00 | Commercial associate | Higher education | Single / not married | House / apartment | -10142 | -2479 | 1 | 0 | 0 | 0 | Managers | 2.00 | 0 |
| 36456 | 5143238 | F | Y | Y | 1 | 1575000.00 | Commercial associate | Higher education | Single / not married | House / apartment | -10142 | -2479 | 1 | 0 | 0 | 0 | Managers | 2.00 | 0 |
# Replace the 'id' variable with the in-built numeric index values.
df.id = df.index
df.head()
| id | code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_mobil | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 1 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 1 | 0 | 0 | 0 | NaN | 2.00 | 0 |
# View the dimensions of the dataset
df.shape
(36457, 19)
# Save the current dataset as csv
df.to_csv('D:\\D - Drive\\IPBA\\BYOP\\Capstone Project\\Final - Credit Card Approval Model\\merged_data.csv', index = False)
# Print the rejection rate
print('There are ' + str(df.status.sum()) + ' rejected applicants.\n',
str(round(df.status.sum() / len(df) * 100, 2)) + '% in 36457 applicants.')
There are 4291 rejected applicants. 11.77% in 36457 applicants.
# Separate the Good applicants and the Bad applicants
# Good applicants
status0 = df.loc[df["status"] == 0]
status0.shape[0]
32166
# Bad applicants
status1 = df.loc[df["status"] == 1]
status1.shape[0]
4291
# Calculate the imbalance ratio
round(len(status0)/len(status1), 2)
7.5
The Imbalance ratio we got is "7.5"
# Let’s check the distribution of the target variable (status) visually using a pie chart.
count1 = 0
count0 = 0
for i in df['status'].values:
if i == 1:
count1 += 1
else:
count0 += 1
count1
4291
count0
32166
count1_perc = (count1 / len(df['status'])) * 100
count0_perc = (count0 / len(df['status'])) * 100
count1_perc
11.77003044682777
count0_perc
88.22996955317223
# Imbalance Ratio
imbalance_ratio = print(str(round(count0_perc / count1_perc, 2)))
7.5
x = ['Bad Applicants (status = 1)', 'Good Applicants (status = 0)']
y = [count1_perc, count0_perc]
explode = (0.15, 0) # only "explode" the 1st slice
colors = ['#ff9999','#99ff99']
fig1, ax1 = plt.subplots(figsize = (8,8))
ax1.pie(y, explode = explode, labels = x, colors=colors, autopct = '%1.2f%%',
shadow = True, startangle = 110, textprops = {'fontsize': 15})
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.suptitle('Data Imbalance', fontsize = 28)
#plt.title('(Status variable)')
plt.show()
Insights:-
# Correlation of merged dataset df
plt.figure(figsize = (25, 22), dpi = 80, facecolor = 'white', edgecolor = 'k')
sns.set(font_scale = 2)
hm = sns.heatmap(df.corr(), annot = True, vmin = -1, vmax = 1, cmap = 'coolwarm', fmt = '.2f',
cbar_kws = {"shrink": .82, 'label': 'Correlation %'},
annot_kws = {"size": 18}, linewidths = 0.1, linecolor = 'white', square = True)
plt.title('Correlation matrix of Merged Data (df)\n')
hm.set(xlabel = '\nApplicants Details', ylabel = 'Applicants Details\n')
hm.set_xticklabels(hm.get_xmajorticklabels(), fontsize = 18, rotation = 45)
hm.set_yticklabels(hm.get_ymajorticklabels(), fontsize = 18)
plt.savefig('D:\\D - Drive\\IPBA\\BYOP\\Capstone Project\\Credit Card Approval\\Temp\\ver 5\\Plotting_Correlation_HeatMap1.jpg')
plt.show()
Insights:-
There are 7 binary features in a dataset 'df':-
Note:-
binary_df = df.copy()
binary_df.head()
| id | code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_mobil | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 1 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 1 | 0 | 0 | 0 | NaN | 2.00 | 0 |
binary_df.shape
(36457, 19)
# Convert the datatypes to category data-type
binary_df['code_gender'] = binary_df['code_gender'].astype('category')
binary_df['flag_own_car'] = binary_df['flag_own_car'].astype('category')
binary_df['flag_own_realty'] = binary_df['flag_own_realty'].astype('category')
binary_df['flag_mobil'] = binary_df['flag_mobil'].astype('category')
binary_df['flag_work_phone'] = binary_df['flag_work_phone'].astype('category')
binary_df['flag_phone'] = binary_df['flag_phone'].astype('category')
binary_df['flag_email'] = binary_df['flag_email'].astype('category')
binary_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 36457 entries, 0 to 36456 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 36457 non-null int64 1 code_gender 36457 non-null category 2 flag_own_car 36457 non-null category 3 flag_own_realty 36457 non-null category 4 cnt_children 36457 non-null int64 5 amt_income_total 36457 non-null float64 6 name_income_type 36457 non-null object 7 name_education_type 36457 non-null object 8 name_family_status 36457 non-null object 9 name_housing_type 36457 non-null object 10 days_birth 36457 non-null int64 11 days_employed 36457 non-null int64 12 flag_mobil 36457 non-null category 13 flag_work_phone 36457 non-null category 14 flag_phone 36457 non-null category 15 flag_email 36457 non-null category 16 occupation_type 25134 non-null object 17 cnt_fam_members 36457 non-null float64 18 status 36457 non-null int64 dtypes: category(7), float64(2), int64(5), object(5) memory usage: 3.6+ MB
# Reason for dropping 'flag_mobil' column
pd.crosstab(df['flag_mobil'], df['status'], margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| flag_mobil | |||
| 1 | 32166 | 4291 | 36457 |
| All | 32166 | 4291 | 36457 |
Insights:-
* There is only one category of '1's, which means that every applicant has a mobile phone.
* Therefore, flag_mobil variable will be dropped as it is not significant for the model building.
# Drop the 'flag_mobil' variable
binary_df = binary_df.drop(['flag_mobil'], axis = 1)
binary_df.head()
| id | code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
# Drop the Id variable as it is not significant
binary_df = binary_df.drop(['id'], axis = 1)
binary_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
binary_df.shape
(36457, 17)
binary_df.code_gender.value_counts()
F 24430 M 12027 Name: code_gender, dtype: int64
# Use crosstabs
pd.crosstab(binary_df['code_gender'], binary_df['status'], margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| code_gender | |||
| F | 21672 | 2758 | 24430 |
| M | 10494 | 1533 | 12027 |
| All | 32166 | 4291 | 36457 |
# Genderwise rejection break-up
gender_rej_perc = (binary_df.groupby('code_gender')['status']
.value_counts(normalize = True)
.reset_index(name = 'perc'))
gender_rej_perc
| code_gender | status | perc | |
|---|---|---|---|
| 0 | F | 0 | 0.89 |
| 1 | F | 1 | 0.11 |
| 2 | M | 0 | 0.87 |
| 3 | M | 1 | 0.13 |
# Gender distribution on the basis of Good applicants only
# Count
#status0_gen_c = status0.code_gender.value_counts()
#status0_gen_c
status0_gen_c = binary_df.loc[binary_df["status"] == 0] #.code_gender.value_counts()
status0_gen_c.code_gender.value_counts()
F 21672 M 10494 Name: code_gender, dtype: int64
# Total Male and Female gender distribution
plt.subplots(figsize = (8, 8))
sns.countplot(binary_df['code_gender'])
<AxesSubplot:xlabel='code_gender', ylabel='count'>
# Percentage
status0_gen_p = binary_df.loc[binary_df["status"] == 0]
status0_gen_p.code_gender.value_counts(normalize = True)
F 0.67 M 0.33 Name: code_gender, dtype: float64
# Gender distribution on the basis of Bad applicants only
# Count
status1_gen_c = binary_df.loc[binary_df["status"] == 1] #.code_gender.value_counts()
status1_gen_c.code_gender.value_counts()
F 2758 M 1533 Name: code_gender, dtype: int64
# Percentage
status1_gen_p = binary_df.loc[binary_df["status"] == 1]
status1_gen_p.code_gender.value_counts(normalize = True)
F 0.64 M 0.36 Name: code_gender, dtype: float64
plt.figure(figsize = (18, 20))
plt.subplot(221)
sns.countplot(x = 'status', hue = 'code_gender', data = status0, palette = 'Set2')
plt.title("Gender Distribution in Good Applicants\n")
plt.subplot(222)
sns.countplot(x = 'status', hue = 'code_gender', data = status1, palette = 'Set2')
plt.title("Gender Distribution in Bad Applicants\n")
plt.show()
Insights:-
# Check the status count of rejection and acceptance on the basis of gender
gender_df = binary_df.groupby(["code_gender", 'status'])["status"].count()
gender_df
code_gender status
F 0 21672
1 2758
M 0 10494
1 1533
Name: status, dtype: int64
# Total rejection count of Males
gender_df_m = binary_df.loc[binary_df.status == 1].loc[binary_df.code_gender == 'M']
gender_df_m.shape[0]
1533
# Total rejection count of Females
gender_df_f = binary_df.loc[binary_df.status == 1].loc[binary_df.code_gender == 'F']
gender_df_f.shape[0]
2758
# Total rejections
gender_tot = gender_df_f.shape[0] + gender_df_m.shape[0]
gender_tot
4291
# Total eligibles
gender_df_m_eleg = binary_df.loc[binary_df.status == 0].loc[binary_df.code_gender == 'M']
print("Total Eligible Males: " + str(gender_df_m_eleg.shape[0]))
gender_df_f_eleg = binary_df.loc[binary_df.status == 0].loc[binary_df.code_gender == 'F']
print("Total Eligible Females: " + str(gender_df_f_eleg.shape[0]))
gender_eleg = gender_df_f_eleg.shape[0] + gender_df_m_eleg.shape[0]
print("Total Eligible applicants : " + str(gender_eleg))
Total Eligible Males: 10494 Total Eligible Females: 21672 Total Eligible applicants : 32166
# Percencatage of rejection of Males out of total rejections
print('There are ' + str(gender_tot) + ' rejected applicants.')
print('Out of this:-')
print('Males are', gender_df_m.shape[0])
print('Females are', gender_df_f.shape[0], '\n')
print('Percentage of rejection of Males out of total rejections is', str(round(gender_df_m.shape[0] / gender_tot * 100, 2)) + '%.')
print('Percentage of rejection of Females out of total rejections is', str(round(gender_df_f.shape[0] / gender_tot * 100, 2)) + '%.', '\n', '\n')
print('There are ' + str(gender_eleg) + ' eligible applicants.')
print('Out of this:-')
print('Males are', gender_df_m_eleg.shape[0])
print('Females are', gender_df_f_eleg.shape[0], '\n')
print('Percentage of eligible of Males out of total eligible applicants is', str(round(gender_df_m_eleg.shape[0] / gender_eleg * 100, 2)) + '%.')
print('Percentage of eligible of Females out of total eligible applicants is', str(round(gender_df_f_eleg.shape[0] / gender_eleg * 100, 2)) + '%.')
There are 4291 rejected applicants. Out of this:- Males are 1533 Females are 2758 Percentage of rejection of Males out of total rejections is 35.73%. Percentage of rejection of Females out of total rejections is 64.27%. There are 32166 eligible applicants. Out of this:- Males are 10494 Females are 21672 Percentage of eligible of Males out of total eligible applicants is 32.62%. Percentage of eligible of Females out of total eligible applicants is 67.38%.
# Total reject percentage out of 36457 records
tot_gen_rejects_perc = binary_df["status"].sum() / round(len(binary_df["status"])) * 100
print(str(round(tot_gen_rejects_perc, 2)) + '%')
11.77%
# Total reject percentage of Males out of 36457 records
tot_gen_rej_counts_m = round((gender_df_m.shape[0] / (len(binary_df))) * 100, 2)
print(str(tot_gen_rej_counts_m) + '%')
4.2%
# Total reject percentage of Females out of 36457 records
tot_gen_rej_counts_f = round((gender_df_f.shape[0] / (len(binary_df))) * 100, 2)
print(str(tot_gen_rej_counts_f) + '%')
7.57%
# Create a new dataframe of just gender and then add status to it
# Also replace 'M's and 'F's in gender with '1's and '0's
gender_tot_df = ['code_gender']
gender_perc = binary_df[gender_tot_df + ['status']] .replace('M', 1).replace('F', 0)
gender_perc.head()
| code_gender | status | |
|---|---|---|
| 0 | 0 | 1 |
| 1 | 0 | 1 |
| 2 | 0 | 0 |
| 3 | 0 | 0 |
| 4 | 0 | 0 |
gender_perc.value_counts()
code_gender status 0 0 21672 1 0 10494 0 1 2758 1 1 1533 dtype: int64
dict_list = []
for code_gender in gender_tot_df:
for one_type in [0, 1]:
dict_list.append({'feature': code_gender,
'type': one_type,
'reject_rate_percentage': round(len(gender_perc[gender_perc[code_gender] == one_type][gender_perc.status == 1])
/ len(gender_perc[gender_perc[code_gender] == one_type]) * 100, 2),
'count': len(gender_perc[gender_perc[code_gender] == one_type]),
'reject_count': len(gender_perc[gender_perc[code_gender] == one_type][gender_perc.status == 1])
})
gender_binary = pd.DataFrame.from_dict(dict_list)
gender_binary
| feature | type | reject_rate_percentage | count | reject_count | |
|---|---|---|---|---|---|
| 0 | code_gender | 0 | 11.29 | 24430 | 2758 |
| 1 | code_gender | 1 | 12.75 | 12027 | 1533 |
plt.subplots(figsize = (12, 12))
sns.barplot(x = "feature", y = "reject_rate_percentage", hue = "type", data = gender_binary)
plt.show()
Reject rate as per same gender:-
It means that Males rejection percentage out of the total Male applicants is 12.75%.
Total Female applicants are 24430 and out if them 2758 applicants are rejected.
It means that Females rejection percentage out of the total Female applicants is 11.29%.
Therefore, Males are more vulnerable than Feales w.r.t rejection.
Rejection rate as per rejected applicants:-
And percentage-wise it is 11.77%.
Females rejection percentage out of the total rejected applicants is 64.27%.
Whereas Males rejection percentage out of the total rejected applicants is 35.73%.
Here, the Females rejection rate is HIGHER than that of the Males.
According to total records of 36457 applicants:-
Males rejection percentage is 4.2%
Again, we can see that Females rejection rate is higher than that of the Males.
We can clearly see that the REJECTION RATE OF FEMALES is HIGHER than the MALES on 2 counts out of the 3.
binary_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
# Convert the categories of 'code_gender' variable back from 'M's and 'F's to '1's and '0's
# Where Male = M = 1 and
# Female = F = 0
binary_df['code_gender'] = binary_df['code_gender'].replace('M', 1).replace('F', 0)
binary_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | 0 | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | 0 | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 0 | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | 0 | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
binary_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | 0 | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | 0 | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 0 | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | 0 | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
binary_df.shape
(36457, 17)
binary_df.flag_own_car.value_counts()
N 22614 Y 13843 Name: flag_own_car, dtype: int64
# Total Yes and No own_car distribution
plt.subplots(figsize = (8, 8))
sns.countplot(binary_df['flag_own_car'])
<AxesSubplot:xlabel='flag_own_car', ylabel='count'>
# Use crosstabs
pd.crosstab(binary_df['flag_own_car'], binary_df['status'], margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| flag_own_car | |||
| N | 19892 | 2722 | 22614 |
| Y | 12274 | 1569 | 13843 |
| All | 32166 | 4291 | 36457 |
# Car ownership rejection break-up
car_rej_perc = (binary_df.groupby('flag_own_car')['status'].value_counts(normalize = True).reset_index(name = 'perc'))
car_rej_perc
| flag_own_car | status | perc | |
|---|---|---|---|
| 0 | N | 0 | 0.88 |
| 1 | N | 1 | 0.12 |
| 2 | Y | 0 | 0.89 |
| 3 | Y | 1 | 0.11 |
# Car ownership break-up of total applicants
car_rej_tot_perc = binary_df.flag_own_car.value_counts(normalize = True).reset_index(name = 'perc')
car_rej_tot_perc
| index | perc | |
|---|---|---|
| 0 | N | 0.62 |
| 1 | Y | 0.38 |
Insights:-
# Car ownership distribution on the basis of Good applicants only
# Count
#status0_car_c = status0.code_gender.value_counts()
#status0_car_c
status0_car_c = binary_df.loc[binary_df["status"] == 0] #.code_gender.value_counts()
status0_car_c.flag_own_car.value_counts()
N 19892 Y 12274 Name: flag_own_car, dtype: int64
# Percentage
status0_car_p = binary_df.loc[binary_df["status"] == 0]
status0_car_p.flag_own_car.value_counts(normalize = True)
N 0.62 Y 0.38 Name: flag_own_car, dtype: float64
# Car ownership distribution on the basis of Bad applicants only
# Count
status1_car_c = binary_df.loc[binary_df["status"] == 1] #.code_gender.value_counts()
status1_car_c.flag_own_car.value_counts()
N 2722 Y 1569 Name: flag_own_car, dtype: int64
# Percentage
status1_car_p = binary_df.loc[binary_df["status"] == 1]
status1_car_p.flag_own_car.value_counts(normalize = True)
N 0.63 Y 0.37 Name: flag_own_car, dtype: float64
plt.figure(figsize = (18, 20))
plt.subplot(221)
sns.countplot(x = 'status', hue = 'flag_own_car', data = status0, palette = 'Set2')
plt.title("Car Ownership in Good Applicants\n")
plt.subplot(222)
sns.countplot(x = 'status', hue = 'flag_own_car', data = status1, palette = 'Set2')
plt.title("Car Ownership in Bad Applicants\n")
plt.show()
Insights:-
# Find the applicants count who don't own a car w.r.t. status
own_car_st_count = binary_df.groupby(["flag_own_car"])["status"].value_counts(normalize = False).reset_index(name = 'count')
own_car_st_count
| flag_own_car | status | count | |
|---|---|---|---|
| 0 | N | 0 | 19892 |
| 1 | N | 1 | 2722 |
| 2 | Y | 0 | 12274 |
| 3 | Y | 1 | 1569 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = "flag_own_car", y = "count", hue = "status", data = own_car_st_count)
plt.show()
# Find the applicants count who don't own a car w.r.t. status
own_car_st_perc = binary_df.groupby(["flag_own_car"])["status"].value_counts(normalize = True).reset_index(name = 'perc')
own_car_st_perc
| flag_own_car | status | perc | |
|---|---|---|---|
| 0 | N | 0 | 0.88 |
| 1 | N | 1 | 0.12 |
| 2 | Y | 0 | 0.89 |
| 3 | Y | 1 | 0.11 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = "flag_own_car", y = "perc", hue = "status", data = own_car_st_perc)
plt.show()
Analysis:-
* 12.03% of the applicants who don't own the car are rejected.
* 11.33% of the applicants who own the car are rejected.
# Find the applicants count who don't own a car w.r.t. gender
own_car_count = binary_df.groupby(["flag_own_car"])["code_gender"].value_counts(normalize = False).reset_index(name = 'count')
own_car_count
| flag_own_car | level_1 | count | |
|---|---|---|---|
| 0 | N | 0 | 18160 |
| 1 | N | 1 | 4454 |
| 2 | Y | 1 | 7573 |
| 3 | Y | 0 | 6270 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = "flag_own_car", y = "count", hue = "level_1", data = own_car_count)
plt.show()
# Find the applicants percentage who don't own a car w.r.t. gender
own_car_perc = binary_df.groupby(["flag_own_car"])["code_gender"].value_counts(normalize = True).reset_index(name = 'perc')
own_car_perc
| flag_own_car | level_1 | perc | |
|---|---|---|---|
| 0 | N | 0 | 0.80 |
| 1 | N | 1 | 0.20 |
| 2 | Y | 1 | 0.55 |
| 3 | Y | 0 | 0.45 |
plt.subplots(figsize = (8,8))
sns.barplot(x = "flag_own_car", y = "perc", hue = "level_1", data = own_car_perc)
plt.show()
Analysis:-
* Out of 22614 applicants who don't own a car - 80.30% are Females and 19.70% are Males
* Similarly, out of 13843 applicants who own a car = 45.29% are Females and 54.70% are Males
# Find the applicants count who don't own a car w.r.t. status
own_car_gen_count = binary_df.groupby(["flag_own_car", 'code_gender'])["status"].value_counts(normalize = False).reset_index(name = 'count')
own_car_gen_count
| flag_own_car | code_gender | status | count | |
|---|---|---|---|---|
| 0 | N | 0 | 0 | 16059 |
| 1 | N | 0 | 1 | 2101 |
| 2 | N | 1 | 0 | 3833 |
| 3 | N | 1 | 1 | 621 |
| 4 | Y | 0 | 0 | 5613 |
| 5 | Y | 0 | 1 | 657 |
| 6 | Y | 1 | 0 | 6661 |
| 7 | Y | 1 | 1 | 912 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = "flag_own_car", y = "count", hue = "code_gender", data = own_car_gen_count)
plt.show()
# Find the applicants percentage who don't own a car w.r.t. gender
own_car_gen_perc = binary_df.groupby(["flag_own_car", 'code_gender'])["status"].value_counts(normalize = True).reset_index(name = 'perc')
own_car_gen_perc
| flag_own_car | code_gender | status | perc | |
|---|---|---|---|---|
| 0 | N | 0 | 0 | 0.88 |
| 1 | N | 0 | 1 | 0.12 |
| 2 | N | 1 | 0 | 0.86 |
| 3 | N | 1 | 1 | 0.14 |
| 4 | Y | 0 | 0 | 0.90 |
| 5 | Y | 0 | 1 | 0.10 |
| 6 | Y | 1 | 0 | 0.88 |
| 7 | Y | 1 | 1 | 0.12 |
Analysis:-
Applicants who don't own a car:
* 11.56% of Females who don't own a car are rejected.
* 13.94% of Males who don't own a car are rejected.
Applicants who own a car:
* 10.47% of Females who own a car are rejected.
* 12.04% of applicants who own a car are rejected.
# Check the status count of rejection and acceptance on the basis of own_car
own_car_df = binary_df.groupby(["flag_own_car", 'status'])["status"].count()
own_car_df
flag_own_car status
N 0 19892
1 2722
Y 0 12274
1 1569
Name: status, dtype: int64
# Total rejection count of applicants who don't own a car (N)
own_car_df_n = binary_df.loc[binary_df.status == 1].loc[binary_df.flag_own_car == 'N']
own_car_df_n.shape[0]
2722
# Total rejection count of applicants who own a car (Y)
own_car_df_y = binary_df.loc[binary_df.status == 1].loc[binary_df.flag_own_car == 'Y']
own_car_df_y.shape[0]
1569
# Total rejections
own_car_tot = own_car_df_n.shape[0] + own_car_df_y.shape[0]
own_car_tot
4291
# Total eligibles
own_car_df_n_eleg = binary_df.loc[binary_df.status == 0].loc[binary_df.flag_own_car == 'N']
print("Total Eligible with No Car: " + str(own_car_df_n_eleg.shape[0]))
own_car_df_y_eleg = binary_df.loc[binary_df.status == 0].loc[binary_df.flag_own_car == 'Y']
print("Total Eligible with a Car: " + str(own_car_df_y_eleg.shape[0]))
own_car_eleg = own_car_df_n_eleg.shape[0] + own_car_df_y_eleg.shape[0]
print("Total Eligible applicants : " + str(own_car_eleg))
Total Eligible with No Car: 19892 Total Eligible with a Car: 12274 Total Eligible applicants : 32166
# Percencatage of rejection of applicants with or without a car out of total rejections
print('There are ' + str(own_car_tot) + ' rejected applicants.')
print('Out of this:-')
print('Applicants without a car are', own_car_df_n.shape[0])
print('Applicants with a car are', own_car_df_y.shape[0], '\n')
print('Percentage of rejection of applicants without a car out of total rejections is',
str(round(own_car_df_n.shape[0] / own_car_tot * 100, 2)) + '%.')
print('Percentage of rejection of applicants with a car out of total rejections is',
str(round(own_car_df_y.shape[0] / own_car_tot * 100, 2)) + '%.', '\n', '\n')
print('There are ' + str(own_car_eleg) + ' eligible applicants.')
print('Out of this:-')
print('Applicants without a car are', own_car_df_n_eleg.shape[0])
print('Applicants with a car are', own_car_df_y_eleg.shape[0], '\n')
print('Percentage of applicants without a car out of total eligible applicants is', str(round(own_car_df_n_eleg.shape[0] / own_car_eleg * 100, 2)) + '%.')
print('Percentage of applicants with a car out of total eligible applicants is', str(round(own_car_df_y_eleg.shape[0] / own_car_eleg * 100, 2)) + '%.')
There are 4291 rejected applicants. Out of this:- Applicants without a car are 2722 Applicants with a car are 1569 Percentage of rejection of applicants without a car out of total rejections is 63.44%. Percentage of rejection of applicants with a car out of total rejections is 36.56%. There are 32166 eligible applicants. Out of this:- Applicants without a car are 19892 Applicants with a car are 12274 Percentage of applicants without a car out of total eligible applicants is 61.84%. Percentage of applicants with a car out of total eligible applicants is 38.16%.
pd.crosstab(binary_df['flag_own_car'], binary_df['status'], margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| flag_own_car | |||
| N | 19892 | 2722 | 22614 |
| Y | 12274 | 1569 | 13843 |
| All | 32166 | 4291 | 36457 |
# Create a new dataframe of just own_car and then add status to it
own_car_tot_df = ['flag_own_car']
own_car_perc = binary_df[own_car_tot_df + ['status']].replace('Y', 1).replace('N', 0)
own_car_perc.head()
| flag_own_car | status | |
|---|---|---|
| 0 | 0 | 1 |
| 1 | 0 | 1 |
| 2 | 0 | 0 |
| 3 | 1 | 0 |
| 4 | 1 | 0 |
own_car_perc.value_counts()
flag_own_car status 0 0 19892 1 0 12274 0 1 2722 1 1 1569 dtype: int64
dict_list = []
for flag_own_car in own_car_tot_df:
for one_type in [0, 1]:
dict_list.append({'feature': flag_own_car,
'type': one_type,
'reject_rate_percentage': round(len(own_car_perc[own_car_perc[flag_own_car] == one_type]
[own_car_perc.status == 1])
/ len(own_car_perc[own_car_perc[flag_own_car] == one_type]) * 100, 2),
'count': len(own_car_perc[own_car_perc[flag_own_car] == one_type]),
'reject_count': len(own_car_perc[own_car_perc[flag_own_car] == one_type][own_car_perc.status == 1])
})
own_car_binary = pd.DataFrame.from_dict(dict_list)
own_car_binary
| feature | type | reject_rate_percentage | count | reject_count | |
|---|---|---|---|---|---|
| 0 | flag_own_car | 0 | 12.04 | 22614 | 2722 |
| 1 | flag_own_car | 1 | 11.33 | 13843 | 1569 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = "feature", y = "reject_rate_percentage", hue = "type", data = own_car_binary)
plt.show()
Percentage as per not owning a car:-
It means that 62.03% of the applicants don't own a car.
Total applicants are 36457 and out of them 13843 applicants own a car.
It means that 37.97% of the applicants own a car.
Percentage of applicants without a car is HIGHER than those who own a car.
Rejection rate as per car status:-
13843 applicants who own the car - 11.33% of the applicants are rejected.
Rejection rate of of applicants without a car is slightly HIGHER than those with a car.
Percentage of car ownership status as per gender:-
Total applicants who own the car, out of it 54.71% are the Males.
Males have the highest ownership of cars in comparison to Females.
Rejection rate as per the car status on gender basis:-
Own the car:-
Here Males have HIGHER rejection rate as compared to Females who don't own the car.
Rejection rate as per rejected applicants:-
And percentage-wise it is 11.77%.
Applicants rejection percentage out of the total rejected applicants owning no car is 63.44%.
Whereas applicants rejection percentage out of the total rejected applicants owning a car is 36.56%.
Here, the rejection rate of applicants without a car is HIGHER than that of the applicants owning a car.
We can clearly see that the REJECTION RATE OF APPLICANTS is HIGHER if they don't own a car and this impact the Males more than the Females.
binary_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | 0 | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | 0 | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 0 | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | 0 | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
# Convert the categories of 'flag_own_car' variable back from 'Y's and 'N's to '1's and '0's
# Where Y = 1 and
# N = 0
binary_df['flag_own_car'] = binary_df['flag_own_car'].replace('Y', 1).replace('N', 0)
binary_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | 0 | 0 | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | 0 | 0 | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 0 | 1 | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | 0 | 1 | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
binary_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | 0 | 0 | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | 0 | 0 | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 0 | 1 | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | 0 | 1 | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
binary_df.flag_own_realty.value_counts()
Y 24506 N 11951 Name: flag_own_realty, dtype: int64
Analysis:-
# Total Yes and No own_realty distribution
plt.subplots(figsize = (8, 8))
sns.countplot(binary_df['flag_own_realty'])
<AxesSubplot:xlabel='flag_own_realty', ylabel='count'>
# Use crosstabs
pd.crosstab(binary_df['flag_own_realty'], binary_df['status'], margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| flag_own_realty | |||
| N | 10390 | 1561 | 11951 |
| Y | 21776 | 2730 | 24506 |
| All | 32166 | 4291 | 36457 |
# Find the applicants count who don't own a property w.r.t. status
own_prop_st_count = binary_df.groupby(["flag_own_realty"])["status"].value_counts(normalize = False).reset_index(name = 'count')
own_prop_st_count
| flag_own_realty | status | count | |
|---|---|---|---|
| 0 | N | 0 | 10390 |
| 1 | N | 1 | 1561 |
| 2 | Y | 0 | 21776 |
| 3 | Y | 1 | 2730 |
Analysis:-
plt.subplots(figsize = (8, 8))
sns.barplot(x = "flag_own_realty", y = "count", hue = "status", data = own_prop_st_count)
plt.show()
# Find the applicants percentage who don't own a car w.r.t. status
own_prop_st_perc = binary_df.groupby(["flag_own_realty"])["status"].value_counts(normalize = True).reset_index(name = 'perc')
own_prop_st_perc
| flag_own_realty | status | perc | |
|---|---|---|---|
| 0 | N | 0 | 0.87 |
| 1 | N | 1 | 0.13 |
| 2 | Y | 0 | 0.89 |
| 3 | Y | 1 | 0.11 |
Analysis:-
11.14% of the applicants with property are rejected.
The difference between the 2 is merely 1.92%.
plt.subplots(figsize = (8, 8))
sns.barplot(x = "flag_own_realty", y = "perc", hue = "status", data = own_prop_st_perc)
plt.show()
# Find the applicants count who don't own a car w.r.t. gender
own_prop_gen_count = binary_df.groupby(["flag_own_realty"])["code_gender"].value_counts(normalize = False).reset_index(name = 'count')
own_prop_gen_count
| flag_own_realty | level_1 | count | |
|---|---|---|---|
| 0 | N | 0 | 7600 |
| 1 | N | 1 | 4351 |
| 2 | Y | 0 | 16830 |
| 3 | Y | 1 | 7676 |
# Find the applicants percentage who don't own a car w.r.t. gender
own_prop_gen_perc = binary_df.groupby(["flag_own_realty"])["code_gender"].value_counts(normalize = True).reset_index(name = 'perc')
own_prop_gen_perc
| flag_own_realty | level_1 | perc | |
|---|---|---|---|
| 0 | N | 0 | 0.64 |
| 1 | N | 1 | 0.36 |
| 2 | Y | 0 | 0.69 |
| 3 | Y | 1 | 0.31 |
# Find the applicants count who don't own a property w.r.t. gender and rejected as per the status
own_prop_count = binary_df.groupby(["flag_own_realty", 'code_gender'])["status"].value_counts(normalize = False).reset_index(name = 'count')
own_prop_count
| flag_own_realty | code_gender | status | count | |
|---|---|---|---|---|
| 0 | N | 0 | 0 | 6650 |
| 1 | N | 0 | 1 | 950 |
| 2 | N | 1 | 0 | 3740 |
| 3 | N | 1 | 1 | 611 |
| 4 | Y | 0 | 0 | 15022 |
| 5 | Y | 0 | 1 | 1808 |
| 6 | Y | 1 | 0 | 6754 |
| 7 | Y | 1 | 1 | 922 |
Analysis:-
611 Male applicants without property are rejected.
1808 Female applicants with property are rejected.
# Find the applicants percentage who don't own a property w.r.t. gender and rejected as per the status
own_prop_perc = binary_df.groupby(["flag_own_realty", 'code_gender'])["status"].value_counts(normalize = True).reset_index(name = 'perc')
own_prop_perc
| flag_own_realty | code_gender | status | perc | |
|---|---|---|---|---|
| 0 | N | 0 | 0 | 0.88 |
| 1 | N | 0 | 1 | 0.12 |
| 2 | N | 1 | 0 | 0.86 |
| 3 | N | 1 | 1 | 0.14 |
| 4 | Y | 0 | 0 | 0.89 |
| 5 | Y | 0 | 1 | 0.11 |
| 6 | Y | 1 | 0 | 0.88 |
| 7 | Y | 1 | 1 | 0.12 |
Analysis:-
14.04% of Male applicants without property are rejected.
10.74% of Female applicants with property are rejected.
12.01% of Male applicants with property are rejected.
There is a HIGHER rejection rate in case of male applicants as compared to female applicants. But having a property reduces the rejection rate by 2% approx in both the male and female applicants.
# Check the status count of rejection and acceptance on the basis of own_property
own_prop_df = binary_df.groupby(["flag_own_realty", 'status'])["status"].count()
own_prop_df
flag_own_realty status
N 0 10390
1 1561
Y 0 21776
1 2730
Name: status, dtype: int64
# Total rejection count of applicants who don't own a property (N)
own_prop_df_n = binary_df.loc[binary_df.status == 1].loc[binary_df.flag_own_realty == 'N']
own_prop_df_n.shape[0]
1561
# Total rejection count of applicants who own a property (Y)
own_prop_df_y = binary_df.loc[binary_df.status == 1].loc[binary_df.flag_own_realty == 'Y']
own_prop_df_y.shape[0]
2730
# Total rejections
own_prop_tot = own_prop_df_n.shape[0] + own_prop_df_y.shape[0]
own_prop_tot
4291
# Total eligibles
own_prop_df_n_eleg = binary_df.loc[binary_df.status == 0].loc[binary_df.flag_own_realty == 'N']
print("Total Eligible without a property: " + str(own_prop_df_n_eleg.shape[0]))
own_prop_df_y_eleg = binary_df.loc[binary_df.status == 0].loc[binary_df.flag_own_realty == 'Y']
print("Total Eligible with a property: " + str(own_prop_df_y_eleg.shape[0]))
own_prop_eleg = own_prop_df_n_eleg.shape[0] + own_prop_df_y_eleg.shape[0]
print("Total Eligible applicants : " + str(own_prop_eleg))
Total Eligible without a property: 10390 Total Eligible with a property: 21776 Total Eligible applicants : 32166
# Percencatage of rejection of applicants with or without a property out of total rejections
print('There are ' + str(own_prop_tot) + ' rejected applicants.')
print('Out of this:-')
print('Applicants without a property are', own_prop_df_n.shape[0])
print('Applicants with a property are', own_prop_df_y.shape[0], '\n')
print('Percentage of rejection of applicants without a property out of total rejections is',
str(round(own_prop_df_n.shape[0]/own_prop_tot * 100, 2)) + '%.')
print('Percentage of rejection of applicants with a property out of total rejections is',
str(round(own_prop_df_y.shape[0]/own_prop_tot * 100, 2)) + '%.', '\n', '\n')
print('There are ' + str(own_prop_eleg) + ' eligible applicants.')
print('Out of this:-')
print('Applicants without a property are', own_prop_df_n_eleg.shape[0])
print('Applicants with a property are', own_prop_df_y_eleg.shape[0], '\n')
print('Percentage of eligibility of applicants without a property out of total eligibles is', str(round(own_prop_df_n_eleg.shape[0] / own_prop_eleg * 100, 2)) + '%.')
print('Percentage of eligibility of applicants with a property out of total eligibles is', str(round(own_prop_df_y_eleg.shape[0] / own_prop_eleg * 100, 2)) + '%.')
There are 4291 rejected applicants. Out of this:- Applicants without a property are 1561 Applicants with a property are 2730 Percentage of rejection of applicants without a property out of total rejections is 36.38%. Percentage of rejection of applicants with a property out of total rejections is 63.62%. There are 32166 eligible applicants. Out of this:- Applicants without a property are 10390 Applicants with a property are 21776 Percentage of eligibility of applicants without a property out of total eligibles is 32.3%. Percentage of eligibility of applicants with a property out of total eligibles is 67.7%.
Analysis:-
* A strange thing to note from the above observation is that applicants who own a property consists of 63.62% of the rejections out of the total rejections count.
pd.crosstab(binary_df['flag_own_realty'], binary_df['status'], margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| flag_own_realty | |||
| N | 10390 | 1561 | 11951 |
| Y | 21776 | 2730 | 24506 |
| All | 32166 | 4291 | 36457 |
# Create a new dataframe of just own_property and then add status to it
# Also replace 'Y's and 'N's with '1's and '0's in the own_property column
own_prop_tot_df = ['flag_own_realty']
own_prop_perc = binary_df[own_prop_tot_df + ['status']].replace('Y', 1).replace('N', 0)
own_prop_perc.head()
| flag_own_realty | status | |
|---|---|---|
| 0 | 0 | 1 |
| 1 | 0 | 1 |
| 2 | 1 | 0 |
| 3 | 1 | 0 |
| 4 | 1 | 0 |
own_prop_perc.value_counts()
flag_own_realty status 1 0 21776 0 0 10390 1 1 2730 0 1 1561 dtype: int64
dict_list = []
for flag_own_realty in own_prop_tot_df:
for one_type in [0, 1]:
dict_list.append({'feature': flag_own_realty,
'type': one_type,
'reject_rate_percentage': round(len(own_prop_perc[own_prop_perc[flag_own_realty] == one_type]
[own_prop_perc.status == 1])
/ len(own_prop_perc[own_prop_perc[flag_own_realty] == one_type]) * 100, 2),
'count': len(own_prop_perc[own_prop_perc[flag_own_realty] == one_type]),
'reject_count': len(own_prop_perc[own_prop_perc[flag_own_realty] == one_type][own_prop_perc.status == 1])
})
own_prop_binary = pd.DataFrame.from_dict(dict_list)
own_prop_binary
| feature | type | reject_rate_percentage | count | reject_count | |
|---|---|---|---|---|---|
| 0 | flag_own_realty | 0 | 13.06 | 11951 | 1561 |
| 1 | flag_own_realty | 1 | 11.14 | 24506 | 2730 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = "feature", y = "reject_rate_percentage", hue = "type", data = own_prop_binary)
plt.show()
Percentage as per not owning a property:-
It means that 32.78% of the applicants don't own a property.
Total applicants are 36457 and out of them 24506 applicants own a property.
It means that 67.22% of the applicants own a property.
Percentage of applicants with a property is HIGHER than those who don't own a property.
Rejection rate as per property status:-
24506 applicants who own the property - 11.14% of the applicants are rejected.
Rejection rate of the applicants without a property is HIGHER than those with a property.
Percentage of property ownership status as per gender:-
Total applicants who own the property, out of it 31.32% are the Males.
Females have the highest ownership of property in comparison to Males.
Rejection rate as per the property status on gender basis:-
Own the property:-
Here Males have HIGHER rejection rate as compared to Females who don't own the property.
Rejection rate as per rejected applicants:-
And percentage-wise it is 11.77%.
Applicants rejection percentage out of the total rejected applicants owning no property is 36.38%.
Whereas applicants rejection percentage out of the total rejected applicants owning a property is 63.62%.
Here, the rejection rate of applicants with a property is HIGHER than that of the applicants not owning a property.
We can clearly see that the REJECTION RATE OF APPLICANTS is HIGHER if they own a property and this impact the Males more than the Females.
binary_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | 0 | 0 | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | 0 | 0 | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 0 | 1 | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | 0 | 1 | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
# Convert the categories of 'flag_own_realty' variable back from 'Y's and 'N's to '1's and '0's
# Where Y = 1 and
# N = 0
binary_df['flag_own_realty'] = binary_df['flag_own_realty'].replace('Y', 1).replace('N', 0)
binary_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | 0 | 0 | 0 | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | 0 | 0 | 1 | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 0 | 1 | 1 | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | 0 | 1 | 1 | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
binary_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | 0 | 0 | 0 | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | 0 | 0 | 1 | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 0 | 1 | 1 | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | 0 | 1 | 1 | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
binary_df.flag_work_phone.value_counts()
0 28235 1 8222 Name: flag_work_phone, dtype: int64
binary_df.flag_work_phone.value_counts(normalize = True)
0 0.77 1 0.23 Name: flag_work_phone, dtype: float64
Analysis:-
# Total Yes and No own_car distribution
plt.subplots(figsize = (8, 8))
sns.countplot(binary_df['flag_work_phone'])
<AxesSubplot:xlabel='flag_work_phone', ylabel='count'>
# Use crosstabs
pd.crosstab(binary_df['flag_work_phone'], binary_df['status'], margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| flag_work_phone | |||
| 0 | 24911 | 3324 | 28235 |
| 1 | 7255 | 967 | 8222 |
| All | 32166 | 4291 | 36457 |
# Find the applicants count who don't own a work phone w.r.t. status
wp_st_count = binary_df.groupby(["flag_work_phone"])["status"].value_counts(normalize = False).reset_index(name = 'count')
wp_st_count
| flag_work_phone | status | count | |
|---|---|---|---|
| 0 | 0 | 0 | 24911 |
| 1 | 0 | 1 | 3324 |
| 2 | 1 | 0 | 7255 |
| 3 | 1 | 1 | 967 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = "flag_work_phone", y = "count", hue = "status", data = wp_st_count)
plt.show()
# Find the applicants percentage who don't own a work phone w.r.t. status
wp_st_perc = binary_df.groupby(["flag_work_phone"])["status"].value_counts(normalize = True).reset_index(name = 'perc')
wp_st_perc
| flag_work_phone | status | perc | |
|---|---|---|---|
| 0 | 0 | 0 | 0.88 |
| 1 | 0 | 1 | 0.12 |
| 2 | 1 | 0 | 0.88 |
| 3 | 1 | 1 | 0.12 |
Analysis:-
* 11.77% of applicants are rejected for not having a work phone.
* 11.76% of applicants are rejected for having a work phone.
plt.subplots(figsize = (8, 8))
sns.barplot(x = "flag_work_phone", y = "perc", hue = "status", data = wp_st_perc)
plt.show()
# Find the applicants count who don't own a work phone w.r.t. gender
wp_count = binary_df.groupby(["flag_work_phone"])["code_gender"].value_counts(normalize = False).reset_index(name = 'count')
wp_count
| flag_work_phone | level_1 | count | |
|---|---|---|---|
| 0 | 0 | 0 | 19386 |
| 1 | 0 | 1 | 8849 |
| 2 | 1 | 0 | 5044 |
| 3 | 1 | 1 | 3178 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = "flag_work_phone", y = "count", hue = "level_1", data = wp_count)
plt.show()
# Find the applicants percentage who don't own a work phone w.r.t. gender
wp_perc = binary_df.groupby(["flag_work_phone"])["code_gender"].value_counts(normalize = True).reset_index(name = 'perc')
wp_perc
| flag_work_phone | level_1 | perc | |
|---|---|---|---|
| 0 | 0 | 0 | 0.69 |
| 1 | 0 | 1 | 0.31 |
| 2 | 1 | 0 | 0.61 |
| 3 | 1 | 1 | 0.39 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = "flag_work_phone", y = "perc", hue = "level_1", data = wp_perc)
plt.show()
Analysis:-
* Out of 28235 applicants who don't own a work phone - 68.66% are Females and 31.34% are Males
* Similarly, out of 8222 applicants who own a work phone = 61.35% are Females and 38.65% are Males
# Find the applicants count who don't own a work phone w.r.t. status and gender
wp_gen_count = binary_df.groupby(["flag_work_phone", 'code_gender'])["status"].value_counts(normalize = False).reset_index(name = 'count')
wp_gen_count
| flag_work_phone | code_gender | status | count | |
|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 17201 |
| 1 | 0 | 0 | 1 | 2185 |
| 2 | 0 | 1 | 0 | 7710 |
| 3 | 0 | 1 | 1 | 1139 |
| 4 | 1 | 0 | 0 | 4471 |
| 5 | 1 | 0 | 1 | 573 |
| 6 | 1 | 1 | 0 | 2784 |
| 7 | 1 | 1 | 1 | 394 |
# Find the applicants percentage who don't own a work phone w.r.t. status and gender
wp_gen_perc = binary_df.groupby(["flag_work_phone", 'code_gender'])["status"].value_counts(normalize = True).reset_index(name = 'perc')
wp_gen_perc
| flag_work_phone | code_gender | status | perc | |
|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0.89 |
| 1 | 0 | 0 | 1 | 0.11 |
| 2 | 0 | 1 | 0 | 0.87 |
| 3 | 0 | 1 | 1 | 0.13 |
| 4 | 1 | 0 | 0 | 0.89 |
| 5 | 1 | 0 | 1 | 0.11 |
| 6 | 1 | 1 | 0 | 0.88 |
| 7 | 1 | 1 | 1 | 0.12 |
Analysis:-
Applicants who don't own a work phone:
* 11.27% of Females who don't own a work phone are rejected.
* 12.87% of Males who don't own a work phone are rejected.
Applicants who own a work phone:
* 11.36% of Females who own a work phone are rejected.
* 12.40% of applicants who own a work phone are rejected.
# Check the status count of rejection and acceptance on the basis of work_phone
wp_df = binary_df.groupby(["flag_work_phone", 'status'])["status"].count()
wp_df
flag_work_phone status
0 0 24911
1 3324
1 0 7255
1 967
Name: status, dtype: int64
# Total rejection count of applicants who don't own a work_phone (N)
wp_df_n = binary_df.loc[binary_df.status == 1].loc[binary_df.flag_work_phone == 0]
wp_df_n.shape[0]
3324
# Total rejection count of applicants who own a work_phone (Y)
wp_df_y = binary_df.loc[binary_df.status == 1].loc[binary_df.flag_work_phone == 1]
wp_df_y.shape[0]
967
# Total rejections
wp_tot = wp_df_n.shape[0] + wp_df_y.shape[0]
wp_tot
4291
# Total eligibles
wp_df_n_eleg = binary_df.loc[binary_df.status == 0].loc[binary_df.flag_work_phone == 0]
print("Total Eligible applicants without a work phone: " + str(wp_df_n_eleg.shape[0]))
wp_df_y_eleg = binary_df.loc[binary_df.status == 0].loc[binary_df.flag_work_phone == 1]
print("Total Eligible applicants with a work phone: " + str(wp_df_y_eleg.shape[0]))
wp_eleg = wp_df_n_eleg.shape[0] + wp_df_y_eleg.shape[0]
print("Total Eligible applicants : " + str(wp_eleg))
Total Eligible applicants without a work phone: 24911 Total Eligible applicants with a work phone: 7255 Total Eligible applicants : 32166
# Percencatage of rejection of applicants with or without a work phone out of total rejections
print('There are ' + str(wp_tot) + ' rejected applicants.')
print('Out of this:-')
print('Applicants without a work phone are', wp_df_n.shape[0])
print('Applicants with a work phone are', wp_df_y.shape[0], '\n')
print('Percentage of rejection of applicants without a work phone out of total rejections is',
str(round(wp_df_n.shape[0]/wp_tot * 100, 2)) + '%.')
print('Percentage of rejection of applicants with a work phone out of total rejections is',
str(round(wp_df_y.shape[0]/wp_tot * 100, 2)) + '%.', '\n', '\n')
print('There are ' + str(wp_eleg) + ' eligible applicants.')
print('Out of this:-')
print('Applicants without a work phone are', wp_df_n_eleg.shape[0])
print('Applicants with a work phone are', wp_df_y_eleg.shape[0], '\n')
print('Percentage of eligibility of applicants without a work phone out of total eligibles is', str(round(wp_df_n_eleg.shape[0] / wp_eleg * 100, 2)) + '%.')
print('Percentage of eligibility of applicants with a work phone out of total eligibles is', str(round(wp_df_y_eleg.shape[0] / wp_eleg * 100, 2)) + '%.')
There are 4291 rejected applicants. Out of this:- Applicants without a work phone are 3324 Applicants with a work phone are 967 Percentage of rejection of applicants without a work phone out of total rejections is 77.46%. Percentage of rejection of applicants with a work phone out of total rejections is 22.54%. There are 32166 eligible applicants. Out of this:- Applicants without a work phone are 24911 Applicants with a work phone are 7255 Percentage of eligibility of applicants without a work phone out of total eligibles is 77.45%. Percentage of eligibility of applicants with a work phone out of total eligibles is 22.55%.
pd.crosstab(binary_df['flag_work_phone'], binary_df['status'], margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| flag_work_phone | |||
| 0 | 24911 | 3324 | 28235 |
| 1 | 7255 | 967 | 8222 |
| All | 32166 | 4291 | 36457 |
# Create a new dataframe of just work_phone and then add status to it
wp_tot_df = ['flag_work_phone']
wp_df_perc = binary_df[wp_tot_df + ['status']]
wp_df_perc.head()
| flag_work_phone | status | |
|---|---|---|
| 0 | 1 | 1 |
| 1 | 1 | 1 |
| 2 | 1 | 0 |
| 3 | 0 | 0 |
| 4 | 0 | 0 |
wp_df_perc.value_counts()
flag_work_phone status 0 0 24911 1 0 7255 0 1 3324 1 1 967 dtype: int64
dict_list = []
for flag_work_phone in wp_tot_df:
for one_type in [0, 1]:
dict_list.append({'feature': flag_work_phone,
'type': one_type,
'reject_rate_percentage': round(len(wp_df_perc[wp_df_perc[flag_work_phone] == one_type]
[wp_df_perc.status == 1])
/ len(wp_df_perc[wp_df_perc[flag_work_phone] == one_type]) * 100, 2),
'count': len(wp_df_perc[wp_df_perc[flag_work_phone] == one_type]),
'reject_count': len(wp_df_perc[wp_df_perc[flag_work_phone] == one_type][wp_df_perc.status == 1])
})
wp_binary = pd.DataFrame.from_dict(dict_list)
wp_binary
| feature | type | reject_rate_percentage | count | reject_count | |
|---|---|---|---|---|---|
| 0 | flag_work_phone | 0 | 11.77 | 28235 | 3324 |
| 1 | flag_work_phone | 1 | 11.76 | 8222 | 967 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = "feature", y = "reject_rate_percentage", hue = "type", data = wp_binary)
plt.show()
Percentage as per not owning a work phone:-
It means that 77.44% of the applicants don't own a work phone.
Total applicants are 36457 and out of them 8222 applicants own a work phone.
It means that 22.56% of the applicants own a work phone.
Percentage of applicants without a work phone is HIGHER than those who own a work phone.
Rejection rate as per work phone status:-
8222 applicants who own the work phone - 11.76% of the applicants are rejected.
Rejection rate of the applicants with or without a work phone is exactly the same.
Percentage of work phone ownership status as per gender:-
Total applicants who own the work phone, out of it 38.65% are the Males.
Females have the highest ownership of work phone in comparison to Males.
Rejection rate as per the work phone status on gender basis:-
Own the work phone:-
Here Males have HIGHER rejection rate as compared to Females who don't own the work phone.
Rejection rate as per rejected applicants:-
And percentage-wise it is 11.77%.
Applicants rejection percentage out of the total rejected applicants owning no work phone is 77.46%.
Whereas applicants rejection percentage out of the total rejected applicants owning a work phone is 22.54%.
Here, the rejection rate of applicants without a work phone is much HIGHER than that of the applicants owning a work phone.
Overall, we can clearly see that the REJECTION RATE OF APPLICANTS is not impacted with or without the work phone.
binary_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | 0 | 0 | 0 | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | 0 | 0 | 1 | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 0 | 1 | 1 | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | 0 | 1 | 1 | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
binary_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | 0 | 0 | 0 | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | 0 | 0 | 1 | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 0 | 1 | 1 | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | 0 | 1 | 1 | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
binary_df.flag_phone.value_counts()
0 25709 1 10748 Name: flag_phone, dtype: int64
binary_df.flag_phone.value_counts(normalize = True)
0 0.71 1 0.29 Name: flag_phone, dtype: float64
Analysis:-
# Use crosstabs
pd.crosstab(binary_df['flag_phone'], binary_df['status'], margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| flag_phone | |||
| 0 | 22649 | 3060 | 25709 |
| 1 | 9517 | 1231 | 10748 |
| All | 32166 | 4291 | 36457 |
# Total Male and Female gender distribution
plt.subplots(figsize = (8, 8))
sns.countplot(binary_df['flag_phone'])
<AxesSubplot:xlabel='flag_phone', ylabel='count'>
# Find the applicants count who don't own a phone w.r.t. status
ph_st_count = binary_df.groupby(["flag_phone"])["status"].value_counts(normalize = False).reset_index(name = 'count')
ph_st_count
| flag_phone | status | count | |
|---|---|---|---|
| 0 | 0 | 0 | 22649 |
| 1 | 0 | 1 | 3060 |
| 2 | 1 | 0 | 9517 |
| 3 | 1 | 1 | 1231 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = "flag_phone", y = "count", hue = "status", data = ph_st_count)
plt.show()
# Find the applicants percentage who don't own a phone w.r.t. status
ph_st_perc = binary_df.groupby(["flag_phone"])["status"].value_counts(normalize = True).reset_index(name = 'perc')
ph_st_perc
| flag_phone | status | perc | |
|---|---|---|---|
| 0 | 0 | 0 | 0.88 |
| 1 | 0 | 1 | 0.12 |
| 2 | 1 | 0 | 0.89 |
| 3 | 1 | 1 | 0.11 |
Analysis:-
plt.subplots(figsize = (8, 8))
sns.barplot(x = "flag_phone", y = "perc", hue = "status", data = ph_st_perc)
plt.show()
# Find the applicants count who don't own a phone w.r.t. gender
ph_count = binary_df.groupby(["flag_phone"])["code_gender"].value_counts(normalize = False).reset_index(name = 'count')
ph_count
| flag_phone | level_1 | count | |
|---|---|---|---|
| 0 | 0 | 0 | 17018 |
| 1 | 0 | 1 | 8691 |
| 2 | 1 | 0 | 7412 |
| 3 | 1 | 1 | 3336 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = "flag_phone", y = "count", hue = "level_1", data = ph_count)
plt.show()
# Find the applicants percentage who don't own a phone w.r.t. gender
ph_perc = binary_df.groupby(["flag_phone"])["code_gender"].value_counts(normalize = True).reset_index(name = 'perc')
ph_perc
| flag_phone | level_1 | perc | |
|---|---|---|---|
| 0 | 0 | 0 | 0.66 |
| 1 | 0 | 1 | 0.34 |
| 2 | 1 | 0 | 0.69 |
| 3 | 1 | 1 | 0.31 |
Analysis:-
# Find the applicants count who don't own a phone w.r.t. status and gender
ph_gen_count = binary_df.groupby(["flag_phone", 'code_gender'])["status"].value_counts(normalize = False).reset_index(name = 'count')
ph_gen_count
| flag_phone | code_gender | status | count | |
|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 15048 |
| 1 | 0 | 0 | 1 | 1970 |
| 2 | 0 | 1 | 0 | 7601 |
| 3 | 0 | 1 | 1 | 1090 |
| 4 | 1 | 0 | 0 | 6624 |
| 5 | 1 | 0 | 1 | 788 |
| 6 | 1 | 1 | 0 | 2893 |
| 7 | 1 | 1 | 1 | 443 |
# Find the applicants percentage who don't own a phone w.r.t. status and gender
ph_gen_perc = binary_df.groupby(["flag_phone", 'code_gender'])["status"].value_counts(normalize = True).reset_index(name = 'perc')
ph_gen_perc
| flag_phone | code_gender | status | perc | |
|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0.88 |
| 1 | 0 | 0 | 1 | 0.12 |
| 2 | 0 | 1 | 0 | 0.87 |
| 3 | 0 | 1 | 1 | 0.13 |
| 4 | 1 | 0 | 0 | 0.89 |
| 5 | 1 | 0 | 1 | 0.11 |
| 6 | 1 | 1 | 0 | 0.87 |
| 7 | 1 | 1 | 1 | 0.13 |
Analysis:-
Applicants who don't own a phone:
* 11.57% of Females who don't own a phone are rejected.
* 12.54% of Males who don't own a phone are rejected.
Applicants who own a phone:
* 10.63% of Females who own a phone are rejected.
* 13.27% of applicants who own a phone are rejected.
# Check the phone count of rejection and acceptance on the basis of status
ph_df = binary_df.groupby(["flag_phone", 'status'])["status"].count()
ph_df
flag_phone status
0 0 22649
1 3060
1 0 9517
1 1231
Name: status, dtype: int64
# Check the phone percentage of rejection and acceptance on the basis of status
ph_df = binary_df.groupby(["flag_phone"])["status"].value_counts(normalize = True).reset_index(name = 'perc')
ph_df
| flag_phone | status | perc | |
|---|---|---|---|
| 0 | 0 | 0 | 0.88 |
| 1 | 0 | 1 | 0.12 |
| 2 | 1 | 0 | 0.89 |
| 3 | 1 | 1 | 0.11 |
# Total rejection count of applicants who don't own a phone (N)
ph_df_n = binary_df.loc[binary_df.status == 1].loc[binary_df.flag_phone == 0]
ph_df_n.shape[0]
3060
# Total rejection count of applicants who own a phone (Y)
ph_df_y = binary_df.loc[binary_df.status == 1].loc[binary_df.flag_phone == 1]
ph_df_y.shape[0]
1231
# Total rejections
ph_tot = ph_df_n.shape[0] + ph_df_y.shape[0]
ph_tot
4291
# Total eligibles
ph_df_n_eleg = binary_df.loc[binary_df.status == 0].loc[binary_df.flag_phone == 0]
print("Total Eligible without a phone: " + str(ph_df_n_eleg.shape[0]))
ph_df_y_eleg = binary_df.loc[binary_df.status == 0].loc[binary_df.flag_phone == 1]
print("Total Eligible with a phone: " + str(ph_df_y_eleg.shape[0]))
ph_eleg = ph_df_n_eleg.shape[0] + ph_df_y_eleg.shape[0]
print("Total Eligible applicants : " + str(ph_eleg))
Total Eligible without a phone: 22649 Total Eligible with a phone: 9517 Total Eligible applicants : 32166
# Percencatage of rejection of applicants with or without a phone out of total rejections
print('There are ' + str(ph_tot) + ' rejected applicants.')
print('Out of this:-')
print('Applicants without a phone are', ph_df_n.shape[0])
print('Applicants with a phone are', ph_df_y.shape[0], '\n')
print('Percentage of rejection of applicants without a phone out of total rejections is',
str(round(ph_df_n.shape[0]/ph_tot * 100, 2)) + '%.')
print('Percentage of rejection of applicants with a phone out of total rejections is',
str(round(ph_df_y.shape[0]/ph_tot * 100, 2)) + '%.' '\n', '\n')
print('There are ' + str(ph_eleg) + ' eligible applicants.')
print('Out of this:-')
print('Applicants without a phone are', ph_df_n_eleg.shape[0])
print('Applicants with a phone are', ph_df_y_eleg.shape[0], '\n')
print('Percentage of eligibility of applicants without a phone out of total eligibles is', str(round(ph_df_n_eleg.shape[0] / ph_eleg * 100, 2)) + '%.')
print('Percentage of eligibility of applicants with a phone out of total eligibles is', str(round(ph_df_y_eleg.shape[0] / ph_eleg * 100, 2)) + '%.')
There are 4291 rejected applicants. Out of this:- Applicants without a phone are 3060 Applicants with a phone are 1231 Percentage of rejection of applicants without a phone out of total rejections is 71.31%. Percentage of rejection of applicants with a phone out of total rejections is 28.69%. There are 32166 eligible applicants. Out of this:- Applicants without a phone are 22649 Applicants with a phone are 9517 Percentage of eligibility of applicants without a phone out of total eligibles is 70.41%. Percentage of eligibility of applicants with a phone out of total eligibles is 29.59%.
pd.crosstab(binary_df['flag_phone'], binary_df['status'], margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| flag_phone | |||
| 0 | 22649 | 3060 | 25709 |
| 1 | 9517 | 1231 | 10748 |
| All | 32166 | 4291 | 36457 |
# Create a new dataframe of just phone and then add status to it
ph_tot_df = ['flag_phone']
ph_df_perc = binary_df[ph_tot_df + ['status']]
ph_df_perc.head()
| flag_phone | status | |
|---|---|---|
| 0 | 0 | 1 |
| 1 | 0 | 1 |
| 2 | 0 | 0 |
| 3 | 0 | 0 |
| 4 | 0 | 0 |
ph_df_perc.value_counts()
flag_phone status 0 0 22649 1 0 9517 0 1 3060 1 1 1231 dtype: int64
dict_list = []
for flag_phone in ph_tot_df:
for one_type in [0, 1]:
dict_list.append({'feature': flag_phone,
'type': one_type,
'reject_rate_percentage': round(len(ph_df_perc[ph_df_perc[flag_phone] == one_type]
[ph_df_perc.status == 1])
/ len(ph_df_perc[ph_df_perc[flag_phone] == one_type]) * 100, 2),
'count': len(ph_df_perc[ph_df_perc[flag_phone] == one_type]),
'reject_count': len(ph_df_perc[ph_df_perc[flag_phone] == one_type][ph_df_perc.status == 1])
})
ph_binary = pd.DataFrame.from_dict(dict_list)
ph_binary
| feature | type | reject_rate_percentage | count | reject_count | |
|---|---|---|---|---|---|
| 0 | flag_phone | 0 | 11.90 | 25709 | 3060 |
| 1 | flag_phone | 1 | 11.45 | 10748 | 1231 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = "feature", y = "reject_rate_percentage", hue = "type", data = ph_binary)
plt.show()
Percentage as per not owning a phone:-
It means that 70.52% of the applicants don't own a phone.
Total applicants are 36457 and out of them 10748 applicants own a phone.
It means that 29.48% of the applicants own a phone.
Percentage of applicants without a phone is HIGHER than those who own a phone.
Rejection rate as per phone status:-
10748 applicants who own the phone - 11.45% of the applicants are rejected.
Rejection rate of the applicants with or without a phone is quite close.
Percentage of phone ownership status as per gender:-
Total applicants who own the phone, out of it 31.04% are the Males.
Females have the highest ownership of phone in comparison to Males.
Rejection rate as per the phone status on gender basis:-
Own the work phone:-
Here Males have HIGHER rejection rate as compared to Females who don't own the phone.
Rejection rate as per rejected applicants:-
And percentage-wise it is 11.77%.
Applicants rejection percentage out of the total rejected applicants owning no phone is 71.31%.
Whereas applicants rejection percentage out of the total rejected applicants owning a phone is 28.69%.
Here, the rejection rate of applicants without a phone is much HIGHER than that of the applicants owning a phone.
Overall, we can clearly see that the REJECTION RATE OF APPLICANTS is not much impacted with or without the phone.
binary_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | 0 | 0 | 0 | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | 0 | 0 | 1 | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 0 | 1 | 1 | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | 0 | 1 | 1 | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
binary_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | 0 | 0 | 0 | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | 0 | 0 | 1 | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 0 | 1 | 1 | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | 0 | 1 | 1 | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
binary_df.flag_email.value_counts()
0 33186 1 3271 Name: flag_email, dtype: int64
binary_df.flag_email.value_counts(normalize = True)
0 0.91 1 0.09 Name: flag_email, dtype: float64
Analysis:-
# Use crosstabs
pd.crosstab(binary_df['flag_email'], binary_df['status'], margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| flag_email | |||
| 0 | 29346 | 3840 | 33186 |
| 1 | 2820 | 451 | 3271 |
| All | 32166 | 4291 | 36457 |
# Total Male and Female gender distribution
plt.subplots(figsize = (8, 8))
sns.countplot(binary_df['flag_email'])
<AxesSubplot:xlabel='flag_email', ylabel='count'>
# Find the applicants count who don't own an email w.r.t. status
e_st_count = binary_df.groupby(["flag_email"])["status"].value_counts(normalize = False).reset_index(name = 'count')
e_st_count
| flag_email | status | count | |
|---|---|---|---|
| 0 | 0 | 0 | 29346 |
| 1 | 0 | 1 | 3840 |
| 2 | 1 | 0 | 2820 |
| 3 | 1 | 1 | 451 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = "flag_email", y = "count", hue = "status", data = e_st_count)
plt.show()
# Find the applicants percentage who don't own an email w.r.t. status
e_st_perc = binary_df.groupby(["flag_email"])["status"].value_counts(normalize = True).reset_index(name = 'perc')
e_st_perc
| flag_email | status | perc | |
|---|---|---|---|
| 0 | 0 | 0 | 0.88 |
| 1 | 0 | 1 | 0.12 |
| 2 | 1 | 0 | 0.86 |
| 3 | 1 | 1 | 0.14 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = "flag_email", y = "perc", hue = "status", data = e_st_perc)
plt.show()
Analysis:-
* 11.57% of the applicants who don't own an email are rejected.
* 13.78% of the applicants who own an email are rejected.
# Find the applicants count who don't own an email w.r.t. gender
e_gen_count = binary_df.groupby(["flag_email"])["code_gender"].value_counts(normalize = False).reset_index(name = 'count')
e_gen_count
| flag_email | level_1 | count | |
|---|---|---|---|
| 0 | 0 | 0 | 22222 |
| 1 | 0 | 1 | 10964 |
| 2 | 1 | 0 | 2208 |
| 3 | 1 | 1 | 1063 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = "flag_email", y = "count", hue = "level_1", data = e_gen_count)
plt.show()
# Find the applicants percentage who don't own an email w.r.t. gender
e_gen_perc = binary_df.groupby(["flag_email"])["code_gender"].value_counts(normalize = True).reset_index(name = 'perc')
e_gen_perc
| flag_email | level_1 | perc | |
|---|---|---|---|
| 0 | 0 | 0 | 0.67 |
| 1 | 0 | 1 | 0.33 |
| 2 | 1 | 0 | 0.68 |
| 3 | 1 | 1 | 0.32 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = "flag_email", y = "perc", hue = "level_1", data = e_gen_perc)
plt.xlabel("Gender Distribution of Email")
plt.ylabel("Percentage")
plt.grid(False)
plt.legend(title = "Gender", loc = 1)
plt.show()
Analysis:-
* Out of 33186 applicants who don't own an email - 66.96% are Females and 33.04% are Males
* Similarly, out of 3271 applicants who own a phone - 67.50% are Females and 32.50% are Males
# Find the applicants count who don't own an email w.r.t. status and gender
e_gen_st_count = binary_df.groupby(["flag_email", 'code_gender'])["status"].value_counts(normalize = False).reset_index(name = 'count')
e_gen_st_count
| flag_email | code_gender | status | count | |
|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 19751 |
| 1 | 0 | 0 | 1 | 2471 |
| 2 | 0 | 1 | 0 | 9595 |
| 3 | 0 | 1 | 1 | 1369 |
| 4 | 1 | 0 | 0 | 1921 |
| 5 | 1 | 0 | 1 | 287 |
| 6 | 1 | 1 | 0 | 899 |
| 7 | 1 | 1 | 1 | 164 |
# Find the applicants count who don't own an email w.r.t. status and gender
e_gen_st_perc = binary_df.groupby(["flag_email", 'code_gender'])["status"].value_counts(normalize = True).reset_index(name = 'perc')
e_gen_st_perc
| flag_email | code_gender | status | perc | |
|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0.89 |
| 1 | 0 | 0 | 1 | 0.11 |
| 2 | 0 | 1 | 0 | 0.88 |
| 3 | 0 | 1 | 1 | 0.12 |
| 4 | 1 | 0 | 0 | 0.87 |
| 5 | 1 | 0 | 1 | 0.13 |
| 6 | 1 | 1 | 0 | 0.85 |
| 7 | 1 | 1 | 1 | 0.15 |
Analysis:-
Applicants who don't own an email:
* 11.11% of Females who don't own an email are rejected.
* 12.48% of Males who don't own an email are rejected.
Applicants who own an email:
* 12.99% of Females who own an email are rejected.
* 15.42% of applicants who own an email are rejected.
# Check an email count of rejection and acceptance on the basis of status
e_df = binary_df.groupby(["flag_email", 'status'])["status"].count()
e_df
flag_email status
0 0 29346
1 3840
1 0 2820
1 451
Name: status, dtype: int64
# Check an email percentage of rejection and acceptance on the basis of status
e_df = binary_df.groupby(["flag_email"])["status"].value_counts(normalize = True).reset_index(name = 'perc')
e_df
| flag_email | status | perc | |
|---|---|---|---|
| 0 | 0 | 0 | 0.88 |
| 1 | 0 | 1 | 0.12 |
| 2 | 1 | 0 | 0.86 |
| 3 | 1 | 1 | 0.14 |
# Total rejection count of applicants who don't own an email (N)
e_df_n = binary_df.loc[binary_df.status == 1].loc[binary_df.flag_email == 0]
e_df_n.shape[0]
3840
# Total rejection count of applicants who own an email (Y)
e_df_y = binary_df.loc[binary_df.status == 1].loc[binary_df.flag_email == 1]
e_df_y.shape[0]
451
# Total rejections
e_tot = e_df_n.shape[0] + e_df_y.shape[0]
e_tot
4291
# Total eligibles
e_df_n_eleg = binary_df.loc[binary_df.status == 0].loc[binary_df.flag_email == 0]
print("Total Eligible without an email: " + str(e_df_n_eleg.shape[0]))
e_df_y_eleg = binary_df.loc[binary_df.status == 0].loc[binary_df.flag_email == 1]
print("Total Eligible with an email: " + str(e_df_y_eleg.shape[0]))
e_eleg = e_df_n_eleg.shape[0] + e_df_y_eleg.shape[0]
print("Total Eligible applicants : " + str(e_eleg))
Total Eligible without an email: 29346 Total Eligible with an email: 2820 Total Eligible applicants : 32166
# Percencatage of rejection of applicants with or without an email out of total rejections
print('There are ' + str(e_tot) + ' rejected applicants.')
print('Out of this:-')
print('Applicants without an email are', e_df_n.shape[0])
print('Applicants with an email are', e_df_y.shape[0], '\n')
print('Percentage of rejection of applicants without an email out of total rejections is',
str(round(e_df_n.shape[0]/e_tot * 100, 2)) + '%.')
print('Percentage of rejection of applicants with an email out of total rejections is',
str(round(e_df_y.shape[0]/e_tot * 100, 2)) + '%.', '\n', '\n')
print('There are ' + str(e_eleg) + ' eligible applicants.')
print('Out of this:-')
print('Applicants without an email are', e_df_n_eleg.shape[0])
print('Applicants with an email are', e_df_y_eleg.shape[0], '\n')
print('Percentage of eligibility of applicants without an email out of total eligible is', str(round(e_df_n_eleg.shape[0] / e_eleg * 100, 2)) + '%.')
print('Percentage of eligibility of applicants with an email out of total eligible is', str(round(e_df_y_eleg.shape[0] / e_eleg * 100, 2)) + '%.')
There are 4291 rejected applicants. Out of this:- Applicants without an email are 3840 Applicants with an email are 451 Percentage of rejection of applicants without an email out of total rejections is 89.49%. Percentage of rejection of applicants with an email out of total rejections is 10.51%. There are 32166 eligible applicants. Out of this:- Applicants without an email are 29346 Applicants with an email are 2820 Percentage of eligibility of applicants without an email out of total eligible is 91.23%. Percentage of eligibility of applicants with an email out of total eligible is 8.77%.
pd.crosstab(binary_df['flag_email'], binary_df['status'], margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| flag_email | |||
| 0 | 29346 | 3840 | 33186 |
| 1 | 2820 | 451 | 3271 |
| All | 32166 | 4291 | 36457 |
# Create a new dataframe of just an email and then add status to it
e_tot_df = ['flag_email']
e_df_perc = binary_df[e_tot_df + ['status']]
e_df_perc.head()
| flag_email | status | |
|---|---|---|
| 0 | 0 | 1 |
| 1 | 0 | 1 |
| 2 | 0 | 0 |
| 3 | 0 | 0 |
| 4 | 0 | 0 |
e_df_perc.value_counts()
flag_email status
0 0 29346
1 3840
1 0 2820
1 451
dtype: int64
dict_list = []
for flag_email in e_tot_df:
for one_type in [0, 1]:
dict_list.append({'feature': flag_email,
'type': one_type,
'reject_rate_percentage': round(len(e_df_perc[e_df_perc[flag_email] == one_type]
[e_df_perc.status == 1])
/ len(e_df_perc[e_df_perc[flag_email] == one_type]) * 100, 2),
'count': len(e_df_perc[e_df_perc[flag_email] == one_type]),
'reject_count': len(e_df_perc[e_df_perc[flag_email] == one_type][e_df_perc.status == 1])
})
e_binary = pd.DataFrame.from_dict(dict_list)
e_binary
| feature | type | reject_rate_percentage | count | reject_count | |
|---|---|---|---|---|---|
| 0 | flag_email | 0 | 11.57 | 33186 | 3840 |
| 1 | flag_email | 1 | 13.79 | 3271 | 451 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = "feature", y = "reject_rate_percentage", hue = "type", data = e_binary)
plt.grid(False)
plt.legend(title = "Email", loc = 1)
plt.show()
Percentage as per not owning an email:-
It means that 91.03% of the applicants don't own an email.
Total applicants are 36457 and out of them 3271 applicants own an email.
It means that 8.97% of the applicants own an email.
Percentage of applicants without an email is HIGHER than those who own an email.
Rejection rate as per email status:-
3271 applicants who own an email - 13.78% of the applicants are rejected.
Rejection rate of the applicants with an email is bit HIGHER than the ones without an email.
Percentage of email ownership status as per gender:-
Total applicants who own an email, out of it 32.50% are the Males.
Females have the highest ownership of emails in comparison to Males.
Rejection rate as per an email status on gender basis:-
Own an email:-
Here Males have HIGHER rejection rate as compared to Females who don't own an email.
Rejection rate as per rejected applicants:-
And percentage-wise it is 11.77%.
Applicants rejection percentage out of the total rejected applicants owning no email is 89.49%.
Whereas applicants rejection percentage out of the total rejected applicants owning an email is 10.51%.
Here, the rejection rate of applicants without an email is much HIGHER than that of the applicants owning an email.
Overall, we can clearly see that the REJECTION RATE OF APPLICANTS is not much impacted with or without an email.
binary_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | 0 | 0 | 0 | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | 0 | 0 | 1 | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 0 | 1 | 1 | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | 0 | 1 | 1 | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
binary_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | 0 | 0 | 0 | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | 0 | 0 | 1 | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 0 | 1 | 1 | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | 0 | 1 | 1 | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
binary_features = ['code_gender', 'flag_own_car', 'flag_own_realty', 'flag_work_phone', 'flag_phone', 'flag_email']
binary_df_plot = binary_df[binary_features + ['status']]
dict_list = []
for feature in binary_features:
for one_type in [0, 1]:
dict_list.append({'feature': feature,
'type': one_type,
'reject_rate_percentage': round(len(binary_df_plot[binary_df_plot[feature] == one_type][binary_df_plot.status == 1]) /
len(binary_df_plot[binary_df_plot[feature] == one_type]) * 100, 2),
'count': len(binary_df_plot[binary_df_plot[feature] == one_type]),
'reject_count': len(binary_df_plot[binary_df_plot[feature] == one_type][binary_df_plot.status == 1])
})
binary_df_plot.head()
| code_gender | flag_own_car | flag_own_realty | flag_work_phone | flag_phone | flag_email | status | |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
| 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
| 2 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
| 3 | 0 | 1 | 1 | 0 | 0 | 0 | 0 |
| 4 | 0 | 1 | 1 | 0 | 0 | 0 | 0 |
group_binary = pd.DataFrame.from_dict(dict_list)
group_binary
| feature | type | reject_rate_percentage | count | reject_count | |
|---|---|---|---|---|---|
| 0 | code_gender | 0 | 11.29 | 24430 | 2758 |
| 1 | code_gender | 1 | 12.75 | 12027 | 1533 |
| 2 | flag_own_car | 0 | 12.04 | 22614 | 2722 |
| 3 | flag_own_car | 1 | 11.33 | 13843 | 1569 |
| 4 | flag_own_realty | 0 | 13.06 | 11951 | 1561 |
| 5 | flag_own_realty | 1 | 11.14 | 24506 | 2730 |
| 6 | flag_work_phone | 0 | 11.77 | 28235 | 3324 |
| 7 | flag_work_phone | 1 | 11.76 | 8222 | 967 |
| 8 | flag_phone | 0 | 11.90 | 25709 | 3060 |
| 9 | flag_phone | 1 | 11.45 | 10748 | 1231 |
| 10 | flag_email | 0 | 11.57 | 33186 | 3840 |
| 11 | flag_email | 1 | 13.79 | 3271 | 451 |
plt.subplots(figsize = (20, 12))
sns.barplot(y = "feature", x = "reject_rate_percentage", hue = "type", data = group_binary, orient = 'h')
plt.grid(False)
plt.legend(title = "Type", loc = 1)
plt.show()
binary_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 1 | 0 | 0 | 0 | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | 1 | 0 | 0 | Managers | 2.00 | 1 |
| 2 | 0 | 0 | 1 | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | 1 | 0 | 0 | Security staff | 2.00 | 0 |
| 3 | 0 | 1 | 1 | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
| 4 | 0 | 1 | 1 | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | 0 | 0 | 0 | NaN | 2.00 | 0 |
# Convert the binaries of variables back from '1's and '0's to 'Y's and 'N's or to 'M's and 'F's
# Where Y = 1 or M = 1 and
# N = 0 or F = 1
binary_df['code_gender'] = binary_df['code_gender'].replace(1, 'M').replace(0, 'F')
binary_df['flag_own_car'] = binary_df['flag_own_car'].replace(1, 'Y').replace(0, 'N')
binary_df['flag_own_realty'] = binary_df['flag_own_realty'].replace(1, 'Y').replace(0, 'N')
binary_df['flag_work_phone'] = binary_df['flag_work_phone'].replace(1, 'Y').replace(0, 'N')
binary_df['flag_phone'] = binary_df['flag_phone'].replace(1, 'Y').replace(0, 'N')
binary_df['flag_email'] = binary_df['flag_email'].replace(1, 'Y').replace(0, 'N')
binary_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | Y | N | N | Security staff | 2.00 | 0 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 |
There are 5 binary features in a dataset 'new_df':-
binary_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | Y | N | N | Security staff | 2.00 | 0 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 |
continuous_df = binary_df.copy()
continuous_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | Y | N | N | Security staff | 2.00 | 0 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 |
continuous_df.shape
(36457, 17)
numerical_col = continuous_df.select_dtypes(include='number').columns
len(numerical_col)
6
continuous_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 36457 entries, 0 to 36456 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 code_gender 36457 non-null category 1 flag_own_car 36457 non-null category 2 flag_own_realty 36457 non-null category 3 cnt_children 36457 non-null int64 4 amt_income_total 36457 non-null float64 5 name_income_type 36457 non-null object 6 name_education_type 36457 non-null object 7 name_family_status 36457 non-null object 8 name_housing_type 36457 non-null object 9 days_birth 36457 non-null int64 10 days_employed 36457 non-null int64 11 flag_work_phone 36457 non-null category 12 flag_phone 36457 non-null category 13 flag_email 36457 non-null category 14 occupation_type 25134 non-null object 15 cnt_fam_members 36457 non-null float64 16 status 36457 non-null int64 dtypes: category(6), float64(2), int64(4), object(5) memory usage: 3.3+ MB
fig , axes = plt.subplots(nrows = 3, ncols = 2, constrained_layout = True)
fig.subplots_adjust(left = 0, bottom = 0, right = 3, top = 5, wspace = 0.09, hspace = 0.3)
for ax, column in zip(axes.flatten(), numerical_col):
sns.boxplot(continuous_df[column], ax = ax)
plt.grid(False)
plt.show()
Insights:-
plt.figure(figsize=(10, 8))
sns.countplot(x = "cnt_children", data = continuous_df, palette = "viridis_r")
children_count = continuous_df.cnt_children.value_counts()
children_count
for a, b in zip(range(len(children_count)), children_count):
plt.text(a, b, '%.0f' % b, ha = 'center', va = 'bottom', fontsize = 14)
plt.grid(False)
plt.show()
Palette color codes:-
‘Accent’, ‘Accent_r’, ‘Blues’, ‘Blues_r’, ‘BrBG’, ‘BrBG_r’, ‘BuGn’, ‘BuGn_r’, ‘BuPu’, ‘BuPu_r’, ‘CMRmap’, ‘CMRmap_r’, ‘Dark2’, ‘Dark2_r’, ‘GnBu’, ‘GnBu_r’, ‘Greens’, ‘Greens_r’, ‘Greys’, ‘Greys_r’, ‘OrRd’, ‘OrRd_r’, ‘Oranges’, ‘Oranges_r’, ‘PRGn’, ‘PRGn_r’, ‘Paired’, ‘Paired_r’, ‘Pastel1’, ‘Pastel1_r’, ‘Pastel2’, ‘Pastel2_r’, ‘PiYG’, ‘PiYG_r’, ‘PuBu’, ‘PuBuGn’, ‘PuBuGn_r’, ‘PuBu_r’, ‘PuOr’, ‘PuOr_r’, ‘PuRd’, ‘PuRd_r’, ‘Purples’, ‘Purples_r’, ‘RdBu’, ‘RdBu_r’, ‘RdGy’, ‘RdGy_r’, ‘RdPu’, ‘RdPu_r’, ‘RdYlBu’, ‘RdYlBu_r’, ‘RdYlGn’, ‘RdYlGn_r’, ‘Reds’, ‘Reds_r’, ‘Set1’, ‘Set1_r’, ‘Set2’, ‘Set2_r’, ‘Set3’, ‘Set3_r’, ‘Spectral’, ‘Spectral_r’, ‘Wistia’, ‘Wistia_r’, ‘YlGn’, ‘YlGnBu’, ‘YlGnBu_r’, ‘YlGn_r’, ‘YlOrBr’, ‘YlOrBr_r’, ‘YlOrRd’, ‘YlOrRd_r’, ‘afmhot’, ‘afmhot_r’, ‘autumn’, ‘autumn_r’, ‘binary’, ‘binary_r’, ‘bone’, ‘bone_r’, ‘brg’, ‘brg_r’, ‘bwr’, ‘bwr_r’, ‘cividis’, ‘cividis_r’, ‘cool’, ‘cool_r’, ‘coolwarm’, ‘coolwarm_r’, ‘copper’, ‘copper_r’, ‘cubehelix’, ‘cubehelix_r’, ‘flag’, ‘flag_r’, ‘gist_earth’, ‘gist_earth_r’, ‘gist_gray’, ‘gist_gray_r’, ‘gist_heat’, ‘gist_heat_r’, ‘gist_ncar’, ‘gist_ncar_r’, ‘gist_rainbow’, ‘gist_rainbow_r’, ‘gist_stern’, ‘gist_stern_r’, ‘gist_yarg’, ‘gist_yarg_r’, ‘gnuplot’, ‘gnuplot2’, ‘gnuplot2_r’, ‘gnuplot_r’, ‘gray’, ‘gray_r’, ‘hot’, ‘hot_r’, ‘hsv’, ‘hsv_r’, ‘icefire’, ‘icefire_r’, ‘inferno’, ‘inferno_r’, ‘jet’, ‘jet_r’, ‘magma’, ‘magma_r’, ‘mako’, ‘mako_r’, ‘nipy_spectral’, ‘nipy_spectral_r’, ‘ocean’, ‘ocean_r’, ‘pink’, ‘pink_r’, ‘plasma’, ‘plasma_r’, ‘prism’, ‘prism_r’, ‘rainbow’, ‘rainbow_r’, ‘rocket’, ‘rocket_r’, ‘seismic’, ‘seismic_r’, ‘spring’, ‘spring_r’, ‘summer’, ‘summer_r’, ‘tab10’, ‘tab10_r’,’tab20′, ‘tab20_r’, ‘tab20b’, ‘tab20b_r’, ‘tab20c’, ‘tab20c_r’, ‘terrain’, ‘terrain_r’, ‘turbo’, ‘turbo_r’, ‘twilight’, ‘twilight_r’, ‘twilight_shifted’, ‘twilight_shifted_r’, ‘viridis’, ‘viridis_r’, ‘vlag’, ‘vlag_r’, ‘winter’, ‘winter_r’
continuous_df.cnt_children.value_counts()
0 25201 1 7492 2 3256 3 419 4 63 5 20 14 3 7 2 19 1 Name: cnt_children, dtype: int64
continuous_df.cnt_children.value_counts(normalize = True)
0 0.69 1 0.21 2 0.09 3 0.01 4 0.00 5 0.00 14 0.00 7 0.00 19 0.00 Name: cnt_children, dtype: float64
# Find the applicants children count w.r.t. status
child_st_count = continuous_df.groupby(["cnt_children"])["status"].value_counts(normalize = False).reset_index(name = 'count')
child_st_count
| cnt_children | status | count | |
|---|---|---|---|
| 0 | 0 | 0 | 22259 |
| 1 | 0 | 1 | 2942 |
| 2 | 1 | 0 | 6642 |
| 3 | 1 | 1 | 850 |
| 4 | 2 | 0 | 2821 |
| 5 | 2 | 1 | 435 |
| 6 | 3 | 0 | 364 |
| 7 | 3 | 1 | 55 |
| 8 | 4 | 0 | 58 |
| 9 | 4 | 1 | 5 |
| 10 | 5 | 0 | 20 |
| 11 | 7 | 1 | 2 |
| 12 | 14 | 1 | 2 |
| 13 | 14 | 0 | 1 |
| 14 | 19 | 0 | 1 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = "cnt_children", y = "count", hue = "status", data = child_st_count)
plt.grid(False)
plt.legend(loc = 1, title = 'Status')
plt.show()
# Find the applicants children percentage w.r.t. status
child_st_perc = continuous_df.groupby(["cnt_children"])["status"].value_counts(normalize = True).reset_index(name = 'perc')
child_st_perc
| cnt_children | status | perc | |
|---|---|---|---|
| 0 | 0 | 0 | 0.88 |
| 1 | 0 | 1 | 0.12 |
| 2 | 1 | 0 | 0.89 |
| 3 | 1 | 1 | 0.11 |
| 4 | 2 | 0 | 0.87 |
| 5 | 2 | 1 | 0.13 |
| 6 | 3 | 0 | 0.87 |
| 7 | 3 | 1 | 0.13 |
| 8 | 4 | 0 | 0.92 |
| 9 | 4 | 1 | 0.08 |
| 10 | 5 | 0 | 1.00 |
| 11 | 7 | 1 | 1.00 |
| 12 | 14 | 1 | 0.67 |
| 13 | 14 | 0 | 0.33 |
| 14 | 19 | 0 | 1.00 |
plt.subplots(figsize = (16, 8))
sns.barplot(x = "cnt_children", y = "perc", hue = "status", data = child_st_perc)
plt.grid(False)
plt.legend(loc = 1, title = 'Status')
plt.show()
# Check the children count
child_df = continuous_df["cnt_children"].value_counts()
child_df
0 25201 1 7492 2 3256 3 419 4 63 5 20 14 3 7 2 19 1 Name: cnt_children, dtype: int64
# Check the children percentage
child_df_perc = continuous_df["cnt_children"].value_counts(normalize = True).reset_index(name = 'perc')
child_df_perc
| index | perc | |
|---|---|---|
| 0 | 0 | 0.69 |
| 1 | 1 | 0.21 |
| 2 | 2 | 0.09 |
| 3 | 3 | 0.01 |
| 4 | 4 | 0.00 |
| 5 | 5 | 0.00 |
| 6 | 14 | 0.00 |
| 7 | 7 | 0.00 |
| 8 | 19 | 0.00 |
Analysis:-
* Very few applicants have more than 2 child.
# Check the children count of rejection and acceptance on the basis of status
child__st_df_count = continuous_df.groupby(["cnt_children"])["status"].value_counts()
child__st_df_count
cnt_children status
0 0 22259
1 2942
1 0 6642
1 850
2 0 2821
1 435
3 0 364
1 55
4 0 58
1 5
5 0 20
7 1 2
14 1 2
0 1
19 0 1
Name: status, dtype: int64
# Check the children count of rejection on the basis of status
#child__st_df_count_r = new_df.groupby(["total_children"])["status"]
child__st_df_count_r = continuous_df.loc[continuous_df.status == 1]
child__st_df_count_r = child__st_df_count_r.cnt_children.value_counts(normalize = False).reset_index(name = 'count')
child__st_df_count_r
| index | count | |
|---|---|---|
| 0 | 0 | 2942 |
| 1 | 1 | 850 |
| 2 | 2 | 435 |
| 3 | 3 | 55 |
| 4 | 4 | 5 |
| 5 | 7 | 2 |
| 6 | 14 | 2 |
# Rename the index to total_children
child__st_df_count_r = child__st_df_count_r.rename(columns = {'index' : 'cnt_children'})
child__st_df_count_r
| cnt_children | count | |
|---|---|---|
| 0 | 0 | 2942 |
| 1 | 1 | 850 |
| 2 | 2 | 435 |
| 3 | 3 | 55 |
| 4 | 4 | 5 |
| 5 | 7 | 2 |
| 6 | 14 | 2 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = 'cnt_children', y = 'count', data = child__st_df_count_r)
plt.grid(False)
plt.show()
# Check the children percentage of rejection and acceptance on the basis of status
child__st_df_perc = continuous_df.groupby(["cnt_children"])["status"].value_counts(normalize = True).reset_index(name = 'perc')
child__st_df_perc
| cnt_children | status | perc | |
|---|---|---|---|
| 0 | 0 | 0 | 0.88 |
| 1 | 0 | 1 | 0.12 |
| 2 | 1 | 0 | 0.89 |
| 3 | 1 | 1 | 0.11 |
| 4 | 2 | 0 | 0.87 |
| 5 | 2 | 1 | 0.13 |
| 6 | 3 | 0 | 0.87 |
| 7 | 3 | 1 | 0.13 |
| 8 | 4 | 0 | 0.92 |
| 9 | 4 | 1 | 0.08 |
| 10 | 5 | 0 | 1.00 |
| 11 | 7 | 1 | 1.00 |
| 12 | 14 | 1 | 0.67 |
| 13 | 14 | 0 | 0.33 |
| 14 | 19 | 0 | 1.00 |
# Fetch only the rejected records of children percentage
child__st_df_perc_r = child__st_df_perc.loc[child__st_df_perc.status == 1]
child__st_df_perc_r
| cnt_children | status | perc | |
|---|---|---|---|
| 1 | 0 | 1 | 0.12 |
| 3 | 1 | 1 | 0.11 |
| 5 | 2 | 1 | 0.13 |
| 7 | 3 | 1 | 0.13 |
| 9 | 4 | 1 | 0.08 |
| 11 | 7 | 1 | 1.00 |
| 12 | 14 | 1 | 0.67 |
plt.subplots(figsize = (8, 8))
sns.barplot(x = 'cnt_children', y = 'perc', data = child__st_df_perc_r)
plt.grid(False)
plt.show()
Analysis:-
* The rejection rate of applicants with 0, 1, 2 or 3 children are not quite different.
# Dividing applicants into 5 parts on the basis of rejection count
child_count_5 = [children_count[0], children_count[1], children_count[2], children_count[3], children_count[4:].sum()]
child_count_5
[25201, 7492, 3256, 419, 89]
child_count_5_r = [len(continuous_df[continuous_df.cnt_children == 0][continuous_df.status == 1]),
len(continuous_df[continuous_df.cnt_children == 1][continuous_df.status == 1]),
len(continuous_df[continuous_df.cnt_children == 2][continuous_df.status == 1]),
len(continuous_df[continuous_df.cnt_children == 3][continuous_df.status == 1]),
len(continuous_df[continuous_df.cnt_children >= 4][continuous_df.status == 1])]
child_count_5_r
[2942, 850, 435, 55, 9]
child_count_df_5 = pd.DataFrame.from_dict({
'children_count' : child_count_5,
'reject_count' : child_count_5_r
})
child_count_df_5['approved_count'] = child_count_df_5.children_count - child_count_df_5.reject_count
child_count_df_5['reject_rate'] = child_count_df_5.reject_count / child_count_df_5.children_count
child_count_df_5
| children_count | reject_count | approved_count | reject_rate | |
|---|---|---|---|---|
| 0 | 25201 | 2942 | 22259 | 0.12 |
| 1 | 7492 | 850 | 6642 | 0.11 |
| 2 | 3256 | 435 | 2821 | 0.13 |
| 3 | 419 | 55 | 364 | 0.13 |
| 4 | 89 | 9 | 80 | 0.10 |
plt.subplots(figsize = (12, 8))
sns.barplot(x = 'children_count', y = 'reject_rate', data = child_count_df_5)
plt.grid(False)
plt.show()
# Create new columns in new_df for children count and copy its contents
continuous_df['children_cnt_bucket'] = continuous_df['cnt_children']
continuous_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 | 0 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 | 0 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | Y | N | N | Security staff | 2.00 | 0 | 0 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 | 0 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 | 0 |
continuous_df.shape
(36457, 18)
continuous_df.children_cnt_bucket.value_counts()
0 25201 1 7492 2 3256 3 419 4 63 5 20 14 3 7 2 19 1 Name: children_cnt_bucket, dtype: int64
# Create buckets
continuous_df['children_cnt_bucket'] = continuous_df['cnt_children']
continuous_df['children_cnt_bucket'].value_counts()
0 25201 1 7492 2 3256 3 419 4 63 5 20 14 3 7 2 19 1 Name: children_cnt_bucket, dtype: int64
continuous_df.loc[(continuous_df.children_cnt_bucket > 5), 'children_cnt_bucket'] = 'More than Five'
continuous_df['children_cnt_bucket'].value_counts()
0 25201 1 7492 2 3256 3 419 4 63 5 20 More than Five 6 Name: children_cnt_bucket, dtype: int64
continuous_df['children_cnt_bucket'] = continuous_df['children_cnt_bucket'].replace(0, 'None').replace(
1, 'One').replace(2, 'Two').replace(3, 'Three').replace(4, 'Four').replace(5, 'Five')
continuous_df['children_cnt_bucket'].value_counts()
None 25201 One 7492 Two 3256 Three 419 Four 63 Five 20 More than Five 6 Name: children_cnt_bucket, dtype: int64
continuous_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 | None |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 | None |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | Y | N | N | Security staff | 2.00 | 0 | None |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 | None |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 | None |
continuous_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 | None |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 | None |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | Y | N | N | Security staff | 2.00 | 0 | None |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 | None |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 | None |
continuous_df.shape
(36457, 18)
plt.figure(figsize=(16, 8))
sns.countplot(x = "cnt_fam_members", data = continuous_df, palette = "viridis_r")
family_count = continuous_df.cnt_fam_members.value_counts()
family_count
for a, b in zip(range(len(family_count)), family_count):
plt.text(a, b, '%.0f' % b, ha = 'center', va = 'bottom', fontsize = 14)
plt.grid(False)
plt.show()
continuous_df.cnt_fam_members.value_counts()
2.00 19463 1.00 6987 3.00 6421 4.00 3106 5.00 397 6.00 58 7.00 19 15.00 3 9.00 2 20.00 1 Name: cnt_fam_members, dtype: int64
continuous_df.cnt_fam_members.value_counts(normalize = True)
2.00 0.53 1.00 0.19 3.00 0.18 4.00 0.09 5.00 0.01 6.00 0.00 7.00 0.00 15.00 0.00 9.00 0.00 20.00 0.00 Name: cnt_fam_members, dtype: float64
# Find the applicants family count w.r.t. status
fam_st_count = continuous_df.groupby(["cnt_fam_members"])["status"].value_counts(normalize = False).reset_index(name = 'count')
fam_st_count
| cnt_fam_members | status | count | |
|---|---|---|---|
| 0 | 1.00 | 0 | 6162 |
| 1 | 1.00 | 1 | 825 |
| 2 | 2.00 | 0 | 17200 |
| 3 | 2.00 | 1 | 2263 |
| 4 | 3.00 | 0 | 5698 |
| 5 | 3.00 | 1 | 723 |
| 6 | 4.00 | 0 | 2683 |
| 7 | 4.00 | 1 | 423 |
| 8 | 5.00 | 0 | 349 |
| 9 | 5.00 | 1 | 48 |
| 10 | 6.00 | 0 | 53 |
| 11 | 6.00 | 1 | 5 |
| 12 | 7.00 | 0 | 19 |
| 13 | 9.00 | 1 | 2 |
| 14 | 15.00 | 1 | 2 |
| 15 | 15.00 | 0 | 1 |
| 16 | 20.00 | 0 | 1 |
plt.subplots(figsize = (12, 8))
sns.barplot(x = "cnt_fam_members", y = "count", hue = "status", data = fam_st_count)
plt.grid(False)
plt.legend(loc = 1, title = "Status")
plt.show()
# Find the applicants family percentage w.r.t. status
fam_st_perc = continuous_df.groupby(["cnt_fam_members"])["status"].value_counts(normalize = True).reset_index(name = 'perc')
fam_st_perc
| cnt_fam_members | status | perc | |
|---|---|---|---|
| 0 | 1.00 | 0 | 0.88 |
| 1 | 1.00 | 1 | 0.12 |
| 2 | 2.00 | 0 | 0.88 |
| 3 | 2.00 | 1 | 0.12 |
| 4 | 3.00 | 0 | 0.89 |
| 5 | 3.00 | 1 | 0.11 |
| 6 | 4.00 | 0 | 0.86 |
| 7 | 4.00 | 1 | 0.14 |
| 8 | 5.00 | 0 | 0.88 |
| 9 | 5.00 | 1 | 0.12 |
| 10 | 6.00 | 0 | 0.91 |
| 11 | 6.00 | 1 | 0.09 |
| 12 | 7.00 | 0 | 1.00 |
| 13 | 9.00 | 1 | 1.00 |
| 14 | 15.00 | 1 | 0.67 |
| 15 | 15.00 | 0 | 0.33 |
| 16 | 20.00 | 0 | 1.00 |
plt.subplots(figsize = (16, 8))
sns.barplot(x = "cnt_fam_members", y = "perc", hue = "status", data = fam_st_perc)
plt.grid(False)
plt.show()
# Check the family count
fam_df = continuous_df["cnt_fam_members"].value_counts()
fam_df
2.00 19463 1.00 6987 3.00 6421 4.00 3106 5.00 397 6.00 58 7.00 19 15.00 3 9.00 2 20.00 1 Name: cnt_fam_members, dtype: int64
# Check the family percentage
fam_df_perc = continuous_df["cnt_fam_members"].value_counts(normalize = True).reset_index(name = 'perc')
fam_df_perc
| index | perc | |
|---|---|---|
| 0 | 2.00 | 0.53 |
| 1 | 1.00 | 0.19 |
| 2 | 3.00 | 0.18 |
| 3 | 4.00 | 0.09 |
| 4 | 5.00 | 0.01 |
| 5 | 6.00 | 0.00 |
| 6 | 7.00 | 0.00 |
| 7 | 15.00 | 0.00 |
| 8 | 9.00 | 0.00 |
| 9 | 20.00 | 0.00 |
Analysis:-
* Very few applicants have more than 4 family members
# Check the family count of rejection and acceptance on the basis of status
fam_st_df_count = continuous_df.groupby(["cnt_fam_members"])["status"].value_counts()
fam_st_df_count
cnt_fam_members status
1.00 0 6162
1 825
2.00 0 17200
1 2263
3.00 0 5698
1 723
4.00 0 2683
1 423
5.00 0 349
1 48
6.00 0 53
1 5
7.00 0 19
9.00 1 2
15.00 1 2
0 1
20.00 0 1
Name: status, dtype: int64
# Check the family count of rejection on the basis of status
#fam_st_df_count_r = new_df.groupby(["total_children"])["status"]
fam_st_df_count_r = continuous_df.loc[continuous_df.status == 1]
fam_st_df_count_r = fam_st_df_count_r.cnt_fam_members.value_counts(normalize = False).reset_index(name = 'count')
fam_st_df_count_r
| index | count | |
|---|---|---|
| 0 | 2.00 | 2263 |
| 1 | 1.00 | 825 |
| 2 | 3.00 | 723 |
| 3 | 4.00 | 423 |
| 4 | 5.00 | 48 |
| 5 | 6.00 | 5 |
| 6 | 9.00 | 2 |
| 7 | 15.00 | 2 |
# Rename the index to total_family_members
fam_st_df_count_r = fam_st_df_count_r.rename(columns = {'index' : 'cnt_fam_members'})
fam_st_df_count_r
| cnt_fam_members | count | |
|---|---|---|
| 0 | 2.00 | 2263 |
| 1 | 1.00 | 825 |
| 2 | 3.00 | 723 |
| 3 | 4.00 | 423 |
| 4 | 5.00 | 48 |
| 5 | 6.00 | 5 |
| 6 | 9.00 | 2 |
| 7 | 15.00 | 2 |
plt.subplots(figsize = (12, 8))
sns.barplot(x = 'cnt_fam_members', y = 'count', data = fam_st_df_count_r)
plt.grid(False)
plt.show()
# Check the family percentage of rejection and acceptance on the basis of status
fam_st_df_perc = continuous_df.groupby(["cnt_fam_members"])["status"].value_counts(normalize = True).reset_index(name = 'perc')
fam_st_df_perc
| cnt_fam_members | status | perc | |
|---|---|---|---|
| 0 | 1.00 | 0 | 0.88 |
| 1 | 1.00 | 1 | 0.12 |
| 2 | 2.00 | 0 | 0.88 |
| 3 | 2.00 | 1 | 0.12 |
| 4 | 3.00 | 0 | 0.89 |
| 5 | 3.00 | 1 | 0.11 |
| 6 | 4.00 | 0 | 0.86 |
| 7 | 4.00 | 1 | 0.14 |
| 8 | 5.00 | 0 | 0.88 |
| 9 | 5.00 | 1 | 0.12 |
| 10 | 6.00 | 0 | 0.91 |
| 11 | 6.00 | 1 | 0.09 |
| 12 | 7.00 | 0 | 1.00 |
| 13 | 9.00 | 1 | 1.00 |
| 14 | 15.00 | 1 | 0.67 |
| 15 | 15.00 | 0 | 0.33 |
| 16 | 20.00 | 0 | 1.00 |
# Fetch only the rejected records of children percentage
fam_st_df_perc_r = fam_st_df_perc.loc[fam_st_df_perc.status == 1]
fam_st_df_perc_r
| cnt_fam_members | status | perc | |
|---|---|---|---|
| 1 | 1.00 | 1 | 0.12 |
| 3 | 2.00 | 1 | 0.12 |
| 5 | 3.00 | 1 | 0.11 |
| 7 | 4.00 | 1 | 0.14 |
| 9 | 5.00 | 1 | 0.12 |
| 11 | 6.00 | 1 | 0.09 |
| 13 | 9.00 | 1 | 1.00 |
| 14 | 15.00 | 1 | 0.67 |
plt.subplots(figsize = (12, 8))
sns.barplot(x = 'cnt_fam_members', y = 'perc', data = fam_st_df_perc_r)
plt.grid(False)
plt.show()
Analysis:-
* The rejection rate of applicants with 1, 2 or 3 family members are not quite different.
# Dividing applicants into 4 parts on the basis of rejection count
fam_count_5 = [family_count[1], family_count[2], family_count[3], family_count[4], family_count[5:].sum()]
fam_count_5
[6987, 19463, 6421, 3106, 480]
fam_count_5_r = [len(continuous_df[continuous_df.cnt_fam_members == 1.0][continuous_df.status == 1]),
len(continuous_df[continuous_df.cnt_fam_members == 2.0][continuous_df.status == 1]),
len(continuous_df[continuous_df.cnt_fam_members == 3.0][continuous_df.status == 1]),
len(continuous_df[continuous_df.cnt_fam_members == 4.0][continuous_df.status == 1]),
len(continuous_df[continuous_df.cnt_fam_members >= 5.0][continuous_df.status == 1])]
fam_count_5_r
[825, 2263, 723, 423, 57]
fam_count_df_5 = pd.DataFrame.from_dict({
'family_mem_count' : fam_count_5,
'reject_count' : fam_count_5_r
})
fam_count_df_5['approved_count'] = fam_count_df_5.family_mem_count - fam_count_df_5.reject_count
fam_count_df_5['reject_rate'] = fam_count_df_5.reject_count / fam_count_df_5.family_mem_count
fam_count_df_5
| family_mem_count | reject_count | approved_count | reject_rate | |
|---|---|---|---|---|
| 0 | 6987 | 825 | 6162 | 0.12 |
| 1 | 19463 | 2263 | 17200 | 0.12 |
| 2 | 6421 | 723 | 5698 | 0.11 |
| 3 | 3106 | 423 | 2683 | 0.14 |
| 4 | 480 | 57 | 423 | 0.12 |
plt.subplots(figsize = (12, 8))
sns.barplot(x = 'family_mem_count', y = 'reject_rate', data = fam_count_df_5)
plt.grid(False)
plt.show()
fig = plt.figure()
ax = fig.add_subplot(111)
gp = continuous_df.groupby(by = ['cnt_children', 'cnt_fam_members'])
gp_df = gp.size().reset_index(name = 'times').sort_values('times', ascending = False)[:6]
gp_df['times_10'] = gp_df['times'].apply(lambda x: x/10)
ax.scatter(gp_df['cnt_children'], gp_df['cnt_fam_members'], s = gp_df['times_10'])
plt.xticks(range(4))
plt.yticks(range(6))
plt.xlabel('cnt_children')
plt.ylabel('cnt_fam_members')
plt.grid(False)
plt.show()
gp = continuous_df.groupby(by = ['cnt_children', 'cnt_fam_members'])
gp_df = gp.size().reset_index(name='times').sort_values('times', ascending = False)[:6]
gp_df
| cnt_children | cnt_fam_members | times | |
|---|---|---|---|
| 1 | 0 | 2.00 | 18222 |
| 0 | 0 | 1.00 | 6979 |
| 4 | 1 | 3.00 | 6245 |
| 7 | 2 | 4.00 | 3078 |
| 3 | 1 | 2.00 | 1241 |
| 9 | 3 | 5.00 | 391 |
continuous_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 | None |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 | None |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | Y | N | N | Security staff | 2.00 | 0 | None |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 | None |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 | None |
continuous_df.shape
(36457, 18)
# Create buckets
continuous_df.cnt_fam_members.value_counts()
2.00 19463 1.00 6987 3.00 6421 4.00 3106 5.00 397 6.00 58 7.00 19 15.00 3 9.00 2 20.00 1 Name: cnt_fam_members, dtype: int64
continuous_df['cnt_fam_members_bucket'] = continuous_df['cnt_fam_members']
continuous_df['cnt_fam_members_bucket'].value_counts()
2.00 19463 1.00 6987 3.00 6421 4.00 3106 5.00 397 6.00 58 7.00 19 15.00 3 9.00 2 20.00 1 Name: cnt_fam_members_bucket, dtype: int64
continuous_df.loc[(continuous_df.cnt_fam_members_bucket > 7), 'cnt_fam_members_bucket'] = 'More than Seven'
continuous_df['cnt_fam_members_bucket'].value_counts()
2.0 19463 1.0 6987 3.0 6421 4.0 3106 5.0 397 6.0 58 7.0 19 More than Seven 6 Name: cnt_fam_members_bucket, dtype: int64
continuous_df['cnt_fam_members_bucket'] = continuous_df['cnt_fam_members_bucket'].replace(1, 'One').replace(2, 'Two').replace(
3, 'Three').replace(4, 'Four').replace(5, 'Five').replace(6, 'Six').replace(7, 'Seven')
continuous_df['cnt_fam_members_bucket'].value_counts()
Two 19463 One 6987 Three 6421 Four 3106 Five 397 Six 58 Seven 19 More than Seven 6 Name: cnt_fam_members_bucket, dtype: int64
continuous_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 | None | Two |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 | None | Two |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | Y | N | N | Security staff | 2.00 | 0 | None | Two |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 | None | Two |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 | None | Two |
continuous_df.shape
(36457, 19)
continuous_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 | None | Two |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 | None | Two |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | Y | N | N | Security staff | 2.00 | 0 | None | Two |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 | None | Two |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 | None | Two |
# Distribution Plot
plt.subplots(figsize = (8, 8))
income_plot = pd.Series(continuous_df.amt_income_total, name = "income")
plt.ylabel('Reject_rate')
sns.distplot(income_plot)
plt.ticklabel_format(style='plain')
plt.grid(False)
plt.show()
# Remove the scientific notations
# pd.options.display.float_format = '{:.1f}'.format
# Revert back to scientific notation
# pd.reset_option('display.float_format')
continuous_df.amt_income_total.describe()
count 36457.00 mean 186685.74 std 101789.23 min 27000.00 25% 121500.00 50% 157500.00 75% 225000.00 max 1575000.00 Name: amt_income_total, dtype: float64
# Check the quantiles
np.quantile(continuous_df.amt_income_total, 0.99)
560250.0
continuous_df.amt_income_total.quantile([.01, .25, .5, .75, 0.99])
0.01 54000.00 0.25 121500.00 0.50 157500.00 0.75 225000.00 0.99 560250.00 Name: amt_income_total, dtype: float64
We can notice that most applicants' income is lower than 560250. So we select these applicants to get box plot.
continuous_df.amt_income_total.value_counts()
135000.00 4309
180000.00 3097
157500.00 3089
112500.00 2956
225000.00 2926
...
531000.00 1
73575.00 1
38700.00 1
74061.00 1
160200.00 1
Name: amt_income_total, Length: 265, dtype: int64
# Box Plot
sns.boxplot(x = "status", y = "amt_income_total", data = continuous_df)
plt.grid(False)
plt.show()
plt.boxplot(continuous_df['amt_income_total'])
plt.grid(False)
# IQR
Q1 = np.percentile(continuous_df.amt_income_total, 25)
Q1
121500.0
Q3 = np.percentile(continuous_df.amt_income_total, 75)
Q3
225000.0
Q1,Q3 = np.percentile(continuous_df.amt_income_total, [25,75])
Q1,Q3
(121500.0, 225000.0)
IQR = Q3 - Q1
ul = Q3 + 1.5 * IQR
ll = Q1 - 1.5 * IQR
IQR, ul, ll
(103500.0, 380250.0, -33750.0)
outliers = continuous_df.amt_income_total[(continuous_df.amt_income_total > ul) | (continuous_df.amt_income_total < ll)]
print(outliers.head())
34928 382500.00 34929 382500.00 34930 382500.00 34931 382500.00 34932 382500.00 Name: amt_income_total, dtype: float64
outliers = pd.DataFrame(outliers)
outliers.head()
| amt_income_total | |
|---|---|
| 34928 | 382500.00 |
| 34929 | 382500.00 |
| 34930 | 382500.00 |
| 34931 | 382500.00 |
| 34932 | 382500.00 |
outliers.columns = outliers.columns.str.replace('amt_income_total', 'income_outliers')
outliers.head()
| income_outliers | |
|---|---|
| 34928 | 382500.00 |
| 34929 | 382500.00 |
| 34930 | 382500.00 |
| 34931 | 382500.00 |
| 34932 | 382500.00 |
outliers.income_outliers.value_counts()
450000.00 379 405000.00 282 382500.00 123 675000.00 115 540000.00 87 391500.00 56 427500.00 53 495000.00 43 900000.00 39 387000.00 39 562500.00 35 630000.00 31 585000.00 21 720000.00 19 445500.00 17 472500.00 16 810000.00 15 612000.00 13 560250.00 12 765000.00 9 459000.00 9 1575000.00 8 517500.00 8 423000.00 8 616500.00 8 441000.00 7 652500.00 7 463500.00 7 607500.00 6 1350000.00 6 396000.00 6 787500.00 5 742500.00 5 494100.00 4 945000.00 4 990000.00 4 634500.00 4 414000.00 3 418500.00 3 1125000.00 3 697500.00 2 716323.50 2 468000.00 2 531000.00 1 594000.00 1 432000.00 1 661500.00 1 Name: income_outliers, dtype: int64
outliers.income_outliers.shape
(1529,)
plt.subplots(figsize = (8, 8))
sns.distplot(outliers['income_outliers'])
plt.ticklabel_format(style = 'plain')
plt.grid(False)
non_outliers = continuous_df[continuous_df['amt_income_total'] < ul]
non_outliers.shape
(34928, 19)
plt.figure(figsize=(20,12))
plt.subplot(2,2,1)
sns.distplot(continuous_df['amt_income_total'])
plt.grid(False)
plt.subplot(2,2,2)
sns.boxplot(continuous_df['amt_income_total'])
plt.grid(False)
plt.subplot(2,2,3)
sns.distplot(non_outliers['amt_income_total'])
plt.grid(False)
plt.subplot(2,2,4)
sns.boxplot(non_outliers['amt_income_total'])
plt.grid(False)
plt.show()
continuous_df['income_bucket'] = pd.qcut(continuous_df.amt_income_total,
q = [0, 0.2, 0.5, 0.8, 0.95, 1],
labels = ['Very_low', 'Low', "Medium", 'High', 'Very_high'])
continuous_df['income_bucket'].head()
0 Very_low 1 Very_low 2 Very_low 3 Very_low 4 Very_low Name: income_bucket, dtype: category Categories (5, object): ['Very_low' < 'Low' < 'Medium' < 'High' < 'Very_high']
continuous_df['income_bucket'].value_counts()
Medium 11289 Low 9961 Very_low 8602 High 5026 Very_high 1579 Name: income_bucket, dtype: int64
continuous_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | income_bucket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 | None | Two | Very_low |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 | None | Two | Very_low |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | Y | N | N | Security staff | 2.00 | 0 | None | Two | Very_low |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 | None | Two | Very_low |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 | None | Two | Very_low |
continuous_df.shape
(36457, 20)
continuous_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | days_employed | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | income_bucket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 | None | Two | Very_low |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | -14869 | -5067 | Y | N | N | Managers | 2.00 | 1 | None | Two | Very_low |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | -19128 | -1323 | Y | N | N | Security staff | 2.00 | 0 | None | Two | Very_low |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 | None | Two | Very_low |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | -20086 | 365243 | N | N | N | NaN | 2.00 | 0 | None | Two | Very_low |
continuous_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 36457 entries, 0 to 36456 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 code_gender 36457 non-null category 1 flag_own_car 36457 non-null category 2 flag_own_realty 36457 non-null category 3 cnt_children 36457 non-null int64 4 amt_income_total 36457 non-null float64 5 name_income_type 36457 non-null object 6 name_education_type 36457 non-null object 7 name_family_status 36457 non-null object 8 name_housing_type 36457 non-null object 9 days_birth 36457 non-null int64 10 days_employed 36457 non-null int64 11 flag_work_phone 36457 non-null category 12 flag_phone 36457 non-null category 13 flag_email 36457 non-null category 14 occupation_type 25134 non-null object 15 cnt_fam_members 36457 non-null float64 16 status 36457 non-null int64 17 children_cnt_bucket 36457 non-null object 18 cnt_fam_members_bucket 36457 non-null object 19 income_bucket 36457 non-null category dtypes: category(7), float64(2), int64(4), object(7) memory usage: 3.9+ MB
# We firstly transform the days from birth into years, and get the histogram and Box diagram.
continuous_df['days_birth'] = abs(continuous_df['days_birth'])
continuous_df['days_birth'].head()
0 14869 1 14869 2 19128 3 20086 4 20086 Name: days_birth, dtype: int64
print(continuous_df['days_birth'].unique())
[14869 19128 20086 ... 19235 12999 10142]
print(continuous_df['days_birth'].nunique())
7183
continuous_df['age'] = (continuous_df['days_birth'] / 365.25).astype(int)
continuous_df['age'].unique()
array([40, 52, 54, 65, 57, 48, 61, 60, 56, 53, 62, 33, 36, 39, 50, 35, 64,
34, 49, 30, 58, 38, 27, 43, 59, 47, 37, 24, 55, 66, 42, 29, 31, 28,
63, 46, 51, 23, 67, 32, 26, 68, 44, 45, 41, 25, 22, 21, 20])
continuous_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | ... | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | income_bucket | age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | Y | N | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | Y | N | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | 19128 | ... | Y | N | N | Security staff | 2.00 | 0 | None | Two | Very_low | 52 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | N | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | N | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 |
5 rows × 21 columns
plt.ylabel('Reject_rate')
age_plot = pd.Series(continuous_df.age, name = "age")
sns.distplot(age_plot)
plt.grid(False)
plt.show()
sns.boxplot(x = "status", y = "age", data = continuous_df)
plt.grid(False)
plt.show()
continuous_df.age.value_counts()
37 1200 40 1154 27 1125 28 1124 39 1092 33 1060 38 1058 32 1045 34 1036 42 1028 35 1020 41 1016 43 992 30 982 36 969 31 962 29 951 44 940 46 910 51 900 47 890 48 857 54 848 59 844 50 844 57 837 56 792 53 781 60 780 45 774 49 758 58 722 52 722 55 674 62 640 63 567 61 536 26 527 64 520 25 475 24 386 65 339 66 252 23 209 22 152 67 130 68 26 21 10 20 1 Name: age, dtype: int64
continuous_df.age.describe()
count 36457.00 mean 43.23 std 11.50 min 20.00 25% 34.00 50% 42.00 75% 53.00 max 68.00 Name: age, dtype: float64
# Binning / Bucketing
continuous_df['age_bucket'] = pd.cut(continuous_df['age'],
bins = [18, 25, 35, 60, 100], labels=['Very_Young', 'Young', 'Middle_Age', 'Senior_Citizen'])
continuous_df[['age','age_bucket']].head()
| age | age_bucket | |
|---|---|---|
| 0 | 40 | Middle_Age |
| 1 | 40 | Middle_Age |
| 2 | 52 | Middle_Age |
| 3 | 54 | Middle_Age |
| 4 | 54 | Middle_Age |
continuous_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | ... | flag_phone | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | income_bucket | age | age_bucket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | 19128 | ... | N | N | Security staff | 2.00 | 0 | None | Two | Very_low | 52 | Middle_Age |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age |
5 rows × 22 columns
continuous_df.shape
(36457, 22)
continuous_df.age_bucket.value_counts()
Middle_Age 22382 Young 9832 Senior_Citizen 3010 Very_Young 1233 Name: age_bucket, dtype: int64
continuous_df['age'].plot(kind = 'hist', bins = 20, density = True)
plt.grid(False)
# Separate the Good applicants and the Bad applicants
# Good applicants
new_status0 = continuous_df.loc[continuous_df["status"] == 0]
new_status0.shape[0]
32166
# Bad applicants
new_status1 = continuous_df.loc[continuous_df["status"] == 1]
new_status1.shape[0]
4291
plt.figure(figsize = (30, 10))
plt.subplot(121)
plt.title("For Eligible Applicants = 0")
sns.countplot(x = 'status', hue = 'age_bucket', data = new_status0, palette = 'Set2')
plt.grid(False)
plt.subplot(122)
plt.title("For Not-Eligible Applicants = 1")
sns.countplot(x = 'status', hue = 'age_bucket', data = new_status1 , palette = 'Set2')
plt.grid(False)
plt.show()
Insights:-
continuous_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 36457 entries, 0 to 36456 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 code_gender 36457 non-null category 1 flag_own_car 36457 non-null category 2 flag_own_realty 36457 non-null category 3 cnt_children 36457 non-null int64 4 amt_income_total 36457 non-null float64 5 name_income_type 36457 non-null object 6 name_education_type 36457 non-null object 7 name_family_status 36457 non-null object 8 name_housing_type 36457 non-null object 9 days_birth 36457 non-null int64 10 days_employed 36457 non-null int64 11 flag_work_phone 36457 non-null category 12 flag_phone 36457 non-null category 13 flag_email 36457 non-null category 14 occupation_type 25134 non-null object 15 cnt_fam_members 36457 non-null float64 16 status 36457 non-null int64 17 children_cnt_bucket 36457 non-null object 18 cnt_fam_members_bucket 36457 non-null object 19 income_bucket 36457 non-null category 20 age 36457 non-null int32 21 age_bucket 36457 non-null category dtypes: category(8), float64(2), int32(1), int64(4), object(7) memory usage: 4.0+ MB
There are error values in this column. We will drop them first and get the employed year of each applicants
continuous_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | ... | flag_phone | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | income_bucket | age | age_bucket | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | 19128 | ... | N | N | Security staff | 2.00 | 0 | None | Two | Very_low | 52 | Middle_Age |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age |
5 rows × 22 columns
continuous_df.days_employed
0 -5067
1 -5067
2 -1323
3 365243
4 365243
...
36452 -2479
36453 -2479
36454 -2479
36455 -2479
36456 -2479
Name: days_employed, Length: 36457, dtype: int64
# We firstly transform the days from employed into years, and get the histogram and Box diagram.
print(continuous_df['days_employed'].unique())
[ -5067 -1323 365243 ... -4456 -6518 -2479]
print(continuous_df['days_employed'].nunique())
3640
continuous_df['employed_years'] = continuous_df[continuous_df.days_employed < 0].days_employed.apply(lambda x: int(-x / 365.25))
print(continuous_df['employed_years'].unique())
[13. 3. nan 21. 15. 0. 1. 4. 25. 5. 10. 14. 8. 2. 23. 7. 22. 11. 9. 17. 34. 6. 37. 12. 16. 20. 31. 19. 29. 26. 27. 18. 24. 30. 32. 28. 41. 35. 39. 33. 42. 36. 40. 38. 43.]
print(continuous_df['employed_years'].nunique())
44
continuous_df['employed_years'].value_counts().head(10)
1.00 3233 4.00 2879 3.00 2815 2.00 2779 0.00 2542 6.00 2127 5.00 2060 8.00 1831 7.00 1642 9.00 1281 Name: employed_years, dtype: int64
continuous_df['employed_years'].isnull().sum()
6135
(continuous_df.isnull().sum() / len(continuous_df) * 100).sort_values(ascending = False)
occupation_type 31.06 employed_years 16.83 flag_phone 0.00 age_bucket 0.00 age 0.00 income_bucket 0.00 cnt_fam_members_bucket 0.00 children_cnt_bucket 0.00 status 0.00 cnt_fam_members 0.00 flag_email 0.00 code_gender 0.00 flag_own_car 0.00 days_employed 0.00 days_birth 0.00 name_housing_type 0.00 name_family_status 0.00 name_education_type 0.00 name_income_type 0.00 amt_income_total 0.00 cnt_children 0.00 flag_own_realty 0.00 flag_work_phone 0.00 dtype: float64
Note:-
continuous_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | ... | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | income_bucket | age | age_bucket | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | 19128 | ... | N | Security staff | 2.00 | 0 | None | Two | Very_low | 52 | Middle_Age | 3.00 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | NaN |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | NaN |
5 rows × 23 columns
# Replacing NaN values with Zero (0), as pensioners are retired and they are not employed.
continuous_df['employed_years'] = continuous_df['employed_years'].replace(np.nan, 0)
continuous_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | ... | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | income_bucket | age | age_bucket | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | 19128 | ... | N | Security staff | 2.00 | 0 | None | Two | Very_low | 52 | Middle_Age | 3.00 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
5 rows × 23 columns
continuous_df.employed_years.value_counts()
0.00 8677 1.00 3233 4.00 2879 3.00 2815 2.00 2779 6.00 2127 5.00 2060 8.00 1831 7.00 1642 9.00 1281 10.00 1095 12.00 753 11.00 712 14.00 606 15.00 583 13.00 579 18.00 306 20.00 299 16.00 266 19.00 256 17.00 236 21.00 189 22.00 180 23.00 178 25.00 148 27.00 109 24.00 103 26.00 88 29.00 86 32.00 53 28.00 52 31.00 47 35.00 35 33.00 33 30.00 33 34.00 22 41.00 20 39.00 16 40.00 16 36.00 14 38.00 12 42.00 4 37.00 3 43.00 1 Name: employed_years, dtype: int64
continuous_df.isnull().sum()
code_gender 0 flag_own_car 0 flag_own_realty 0 cnt_children 0 amt_income_total 0 name_income_type 0 name_education_type 0 name_family_status 0 name_housing_type 0 days_birth 0 days_employed 0 flag_work_phone 0 flag_phone 0 flag_email 0 occupation_type 11323 cnt_fam_members 0 status 0 children_cnt_bucket 0 cnt_fam_members_bucket 0 income_bucket 0 age 0 age_bucket 0 employed_years 0 dtype: int64
Note:-
plt.subplots(figsize = (14, 8))
plt.ylabel('Reject_rate')
employed_year_plot = pd.Series(continuous_df.employed_years, name = "employed_years")
sns.distplot(employed_year_plot)
plt.grid(False)
plt.show()
sns.boxplot(x = "status", y = "employed_years", data = continuous_df)
plt.grid(False)
plt.show()
continuous_df.employed_years.describe()
count 36457.00 mean 5.61 std 6.41 min 0.00 25% 1.00 50% 4.00 75% 8.00 max 43.00 Name: employed_years, dtype: float64
continuous_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | ... | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | income_bucket | age | age_bucket | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | 19128 | ... | N | Security staff | 2.00 | 0 | None | Two | Very_low | 52 | Middle_Age | 3.00 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
5 rows × 23 columns
continuous_df.shape
(36457, 23)
continuous_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 36457 entries, 0 to 36456 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 code_gender 36457 non-null category 1 flag_own_car 36457 non-null category 2 flag_own_realty 36457 non-null category 3 cnt_children 36457 non-null int64 4 amt_income_total 36457 non-null float64 5 name_income_type 36457 non-null object 6 name_education_type 36457 non-null object 7 name_family_status 36457 non-null object 8 name_housing_type 36457 non-null object 9 days_birth 36457 non-null int64 10 days_employed 36457 non-null int64 11 flag_work_phone 36457 non-null category 12 flag_phone 36457 non-null category 13 flag_email 36457 non-null category 14 occupation_type 25134 non-null object 15 cnt_fam_members 36457 non-null float64 16 status 36457 non-null int64 17 children_cnt_bucket 36457 non-null object 18 cnt_fam_members_bucket 36457 non-null object 19 income_bucket 36457 non-null category 20 age 36457 non-null int32 21 age_bucket 36457 non-null category 22 employed_years 36457 non-null float64 dtypes: category(8), float64(3), int32(1), int64(4), object(7) memory usage: 4.3+ MB
comparison_df = continuous_df.copy()
comparison_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | ... | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | income_bucket | age | age_bucket | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | 19128 | ... | N | Security staff | 2.00 | 0 | None | Two | Very_low | 52 | Middle_Age | 3.00 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
5 rows × 23 columns
comparison_df['age_5'] = comparison_df.age.apply(lambda x: int(x / 5) * 5)
comparison_df['age_5'].head(10)
0 40 1 40 2 50 3 50 4 50 5 50 6 50 7 50 8 50 9 50 Name: age_5, dtype: int64
# comparison_df['employed_year_5'] = comparison_df[comparison_df.work_experience < 0].work_experience.apply(lambda x: int(-x / 365.25 / 5) * 5)
comparison_df['employed_year_5'] = comparison_df.employed_years.apply(lambda x: int(x / 5) * 5)
comparison_df['employed_year_5'].head(10)
0 10 1 10 2 0 3 0 4 0 5 0 6 0 7 0 8 0 9 0 Name: employed_year_5, dtype: int64
plot_fig = plt.figure()
plt.subplots(figsize = (16, 10))
aei_plot = sns.boxplot(x = "age_5", y = "amt_income_total", hue = 'status', data = comparison_df[comparison_df.amt_income_total <= 382500])
plt.grid(False)
plt.show()
plt.subplots(figsize = (16, 10))
aei_plot = sns.boxplot(x = "employed_year_5", y = "amt_income_total", hue = 'status', data = comparison_df[comparison_df.amt_income_total <= 382500])
plt.grid(False)
plt.show()
<Figure size 864x576 with 0 Axes>
Analysis:-
* As figures above, we can know that in terms of age and income rejected applicants are not quite different from
approved applicants through the combination of five-number summary in boxplot.
* However, in terms of age and employed year, applicants with more than 30 years of service are more likely not to be
rejected.
There are 5 categorical features in a dataset 'continuous_df':-
* name_income_type
* name_education_type
* name_family_status
* name_housing_type
* occupation_type
continuous_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | ... | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | income_bucket | age | age_bucket | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | 19128 | ... | N | Security staff | 2.00 | 0 | None | Two | Very_low | 52 | Middle_Age | 3.00 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
5 rows × 23 columns
categorical_df = continuous_df.copy()
categorical_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | ... | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | income_bucket | age | age_bucket | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | 19128 | ... | N | Security staff | 2.00 | 0 | None | Two | Very_low | 52 | Middle_Age | 3.00 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
5 rows × 23 columns
categorical_df.shape
(36457, 23)
categorical_df.isnull().sum()
code_gender 0 flag_own_car 0 flag_own_realty 0 cnt_children 0 amt_income_total 0 name_income_type 0 name_education_type 0 name_family_status 0 name_housing_type 0 days_birth 0 days_employed 0 flag_work_phone 0 flag_phone 0 flag_email 0 occupation_type 11323 cnt_fam_members 0 status 0 children_cnt_bucket 0 cnt_fam_members_bucket 0 income_bucket 0 age 0 age_bucket 0 employed_years 0 dtype: int64
categorical_df.name_income_type.nunique()
5
categorical_df.name_income_type.unique()
array(['Working', 'Pensioner', 'Commercial associate', 'State servant',
'Student'], dtype=object)
categorical_df.name_income_type.value_counts()
Working 18819 Commercial associate 8490 Pensioner 6152 State servant 2985 Student 11 Name: name_income_type, dtype: int64
categorical_df.name_income_type.value_counts(normalize = True)
Working 0.52 Commercial associate 0.23 Pensioner 0.17 State servant 0.08 Student 0.00 Name: name_income_type, dtype: float64
pd.crosstab(categorical_df['name_income_type'], categorical_df['status'], margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| name_income_type | |||
| Commercial associate | 7410 | 1080 | 8490 |
| Pensioner | 5508 | 644 | 6152 |
| State servant | 2600 | 385 | 2985 |
| Student | 10 | 1 | 11 |
| Working | 16638 | 2181 | 18819 |
| All | 32166 | 4291 | 36457 |
inctyp_total = categorical_df.groupby(by = ['name_income_type']).size().reset_index(name = 'times')
inctyp_total
| name_income_type | times | |
|---|---|---|
| 0 | Commercial associate | 8490 |
| 1 | Pensioner | 6152 |
| 2 | State servant | 2985 |
| 3 | Student | 11 |
| 4 | Working | 18819 |
inctyp_reject = categorical_df[categorical_df.status == 1].groupby(by = ['name_income_type']).size().reset_index(name = 'reject_times')
inctyp_reject
| name_income_type | reject_times | |
|---|---|---|
| 0 | Commercial associate | 1080 |
| 1 | Pensioner | 644 |
| 2 | State servant | 385 |
| 3 | Student | 1 |
| 4 | Working | 2181 |
inctyp_reject_rate = pd.merge(inctyp_total, inctyp_reject, how = 'outer', on = ['name_income_type']).fillna(0)
inctyp_reject_rate
| name_income_type | times | reject_times | |
|---|---|---|---|
| 0 | Commercial associate | 8490 | 1080 |
| 1 | Pensioner | 6152 | 644 |
| 2 | State servant | 2985 | 385 |
| 3 | Student | 11 | 1 |
| 4 | Working | 18819 | 2181 |
inctyp_reject_rate['reject_rate'] = inctyp_reject_rate.reject_times / inctyp_reject_rate.times
inctyp_reject_rate
| name_income_type | times | reject_times | reject_rate | |
|---|---|---|---|---|
| 0 | Commercial associate | 8490 | 1080 | 0.13 |
| 1 | Pensioner | 6152 | 644 | 0.10 |
| 2 | State servant | 2985 | 385 | 0.13 |
| 3 | Student | 11 | 1 | 0.09 |
| 4 | Working | 18819 | 2181 | 0.12 |
plt.subplots(figsize = (12, 8))
sns.barplot(y = "name_income_type", x = "reject_rate", data = inctyp_reject_rate, orient = 'h')
plt.grid(False)
plt.show()
categorical_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | ... | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | income_bucket | age | age_bucket | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | 19128 | ... | N | Security staff | 2.00 | 0 | None | Two | Very_low | 52 | Middle_Age | 3.00 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
5 rows × 23 columns
categorical_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 36457 entries, 0 to 36456 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 code_gender 36457 non-null category 1 flag_own_car 36457 non-null category 2 flag_own_realty 36457 non-null category 3 cnt_children 36457 non-null int64 4 amt_income_total 36457 non-null float64 5 name_income_type 36457 non-null object 6 name_education_type 36457 non-null object 7 name_family_status 36457 non-null object 8 name_housing_type 36457 non-null object 9 days_birth 36457 non-null int64 10 days_employed 36457 non-null int64 11 flag_work_phone 36457 non-null category 12 flag_phone 36457 non-null category 13 flag_email 36457 non-null category 14 occupation_type 25134 non-null object 15 cnt_fam_members 36457 non-null float64 16 status 36457 non-null int64 17 children_cnt_bucket 36457 non-null object 18 cnt_fam_members_bucket 36457 non-null object 19 income_bucket 36457 non-null category 20 age 36457 non-null int32 21 age_bucket 36457 non-null category 22 employed_years 36457 non-null float64 dtypes: category(8), float64(3), int32(1), int64(4), object(7) memory usage: 4.3+ MB
categorical_df.isnull().sum()
code_gender 0 flag_own_car 0 flag_own_realty 0 cnt_children 0 amt_income_total 0 name_income_type 0 name_education_type 0 name_family_status 0 name_housing_type 0 days_birth 0 days_employed 0 flag_work_phone 0 flag_phone 0 flag_email 0 occupation_type 11323 cnt_fam_members 0 status 0 children_cnt_bucket 0 cnt_fam_members_bucket 0 income_bucket 0 age 0 age_bucket 0 employed_years 0 dtype: int64
categorical_df.name_education_type.nunique()
5
categorical_df.name_education_type.unique()
array(['Higher education', 'Secondary / secondary special',
'Lower secondary', 'Incomplete higher', 'Academic degree'],
dtype=object)
categorical_df.name_education_type.value_counts()
Secondary / secondary special 24777 Higher education 9864 Incomplete higher 1410 Lower secondary 374 Academic degree 32 Name: name_education_type, dtype: int64
categorical_df.name_education_type.value_counts(normalize = True)
Secondary / secondary special 0.68 Higher education 0.27 Incomplete higher 0.04 Lower secondary 0.01 Academic degree 0.00 Name: name_education_type, dtype: float64
pd.crosstab(categorical_df['name_education_type'], categorical_df['status'], margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| name_education_type | |||
| Academic degree | 25 | 7 | 32 |
| Higher education | 8716 | 1148 | 9864 |
| Incomplete higher | 1203 | 207 | 1410 |
| Lower secondary | 335 | 39 | 374 |
| Secondary / secondary special | 21887 | 2890 | 24777 |
| All | 32166 | 4291 | 36457 |
edu_total = categorical_df.groupby(by = ['name_education_type']).size().reset_index(name = 'times')
edu_total
| name_education_type | times | |
|---|---|---|
| 0 | Academic degree | 32 |
| 1 | Higher education | 9864 |
| 2 | Incomplete higher | 1410 |
| 3 | Lower secondary | 374 |
| 4 | Secondary / secondary special | 24777 |
edu_reject = categorical_df[categorical_df.status == 1].groupby(by = ['name_education_type']).size().reset_index(name = 'reject_times')
edu_reject
| name_education_type | reject_times | |
|---|---|---|
| 0 | Academic degree | 7 |
| 1 | Higher education | 1148 |
| 2 | Incomplete higher | 207 |
| 3 | Lower secondary | 39 |
| 4 | Secondary / secondary special | 2890 |
edu_reject_rate = pd.merge(edu_total, edu_reject, how = 'outer', on = ['name_education_type']).fillna(0)
edu_reject_rate
| name_education_type | times | reject_times | |
|---|---|---|---|
| 0 | Academic degree | 32 | 7 |
| 1 | Higher education | 9864 | 1148 |
| 2 | Incomplete higher | 1410 | 207 |
| 3 | Lower secondary | 374 | 39 |
| 4 | Secondary / secondary special | 24777 | 2890 |
edu_reject_rate['reject_rate'] = edu_reject_rate.reject_times / edu_reject_rate.times
edu_reject_rate
| name_education_type | times | reject_times | reject_rate | |
|---|---|---|---|---|
| 0 | Academic degree | 32 | 7 | 0.22 |
| 1 | Higher education | 9864 | 1148 | 0.12 |
| 2 | Incomplete higher | 1410 | 207 | 0.15 |
| 3 | Lower secondary | 374 | 39 | 0.10 |
| 4 | Secondary / secondary special | 24777 | 2890 | 0.12 |
plt.subplots(figsize = (15, 8))
sns.barplot(y = "name_education_type", x = "reject_rate", data = edu_reject_rate, orient = 'h')
plt.grid(False)
plt.show()
categorical_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | ... | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | income_bucket | age | age_bucket | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | 19128 | ... | N | Security staff | 2.00 | 0 | None | Two | Very_low | 52 | Middle_Age | 3.00 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
5 rows × 23 columns
categorical_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 36457 entries, 0 to 36456 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 code_gender 36457 non-null category 1 flag_own_car 36457 non-null category 2 flag_own_realty 36457 non-null category 3 cnt_children 36457 non-null int64 4 amt_income_total 36457 non-null float64 5 name_income_type 36457 non-null object 6 name_education_type 36457 non-null object 7 name_family_status 36457 non-null object 8 name_housing_type 36457 non-null object 9 days_birth 36457 non-null int64 10 days_employed 36457 non-null int64 11 flag_work_phone 36457 non-null category 12 flag_phone 36457 non-null category 13 flag_email 36457 non-null category 14 occupation_type 25134 non-null object 15 cnt_fam_members 36457 non-null float64 16 status 36457 non-null int64 17 children_cnt_bucket 36457 non-null object 18 cnt_fam_members_bucket 36457 non-null object 19 income_bucket 36457 non-null category 20 age 36457 non-null int32 21 age_bucket 36457 non-null category 22 employed_years 36457 non-null float64 dtypes: category(8), float64(3), int32(1), int64(4), object(7) memory usage: 4.3+ MB
categorical_df.isnull().sum()
code_gender 0 flag_own_car 0 flag_own_realty 0 cnt_children 0 amt_income_total 0 name_income_type 0 name_education_type 0 name_family_status 0 name_housing_type 0 days_birth 0 days_employed 0 flag_work_phone 0 flag_phone 0 flag_email 0 occupation_type 11323 cnt_fam_members 0 status 0 children_cnt_bucket 0 cnt_fam_members_bucket 0 income_bucket 0 age 0 age_bucket 0 employed_years 0 dtype: int64
categorical_df.name_family_status.nunique()
5
categorical_df.name_family_status.unique()
array(['Civil marriage', 'Married', 'Separated', 'Single / not married',
'Widow'], dtype=object)
categorical_df.name_family_status.value_counts()
Married 25048 Single / not married 4829 Civil marriage 2945 Separated 2103 Widow 1532 Name: name_family_status, dtype: int64
categorical_df.name_family_status.value_counts(normalize = True)
Married 0.69 Single / not married 0.13 Civil marriage 0.08 Separated 0.06 Widow 0.04 Name: name_family_status, dtype: float64
pd.crosstab(categorical_df['name_family_status'], categorical_df['status'], margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| name_family_status | |||
| Civil marriage | 2578 | 367 | 2945 |
| Married | 22134 | 2914 | 25048 |
| Separated | 1878 | 225 | 2103 |
| Single / not married | 4206 | 623 | 4829 |
| Widow | 1370 | 162 | 1532 |
| All | 32166 | 4291 | 36457 |
ms_total = categorical_df.groupby(by = ['name_family_status']).size().reset_index(name = 'times')
ms_total
| name_family_status | times | |
|---|---|---|
| 0 | Civil marriage | 2945 |
| 1 | Married | 25048 |
| 2 | Separated | 2103 |
| 3 | Single / not married | 4829 |
| 4 | Widow | 1532 |
ms_reject = categorical_df[categorical_df.status == 1].groupby(by = ['name_family_status']).size().reset_index(name = 'reject_times')
ms_reject
| name_family_status | reject_times | |
|---|---|---|
| 0 | Civil marriage | 367 |
| 1 | Married | 2914 |
| 2 | Separated | 225 |
| 3 | Single / not married | 623 |
| 4 | Widow | 162 |
ms_reject_rate = pd.merge(ms_total, ms_reject, how = 'outer', on = ['name_family_status']).fillna(0)
ms_reject_rate
| name_family_status | times | reject_times | |
|---|---|---|---|
| 0 | Civil marriage | 2945 | 367 |
| 1 | Married | 25048 | 2914 |
| 2 | Separated | 2103 | 225 |
| 3 | Single / not married | 4829 | 623 |
| 4 | Widow | 1532 | 162 |
ms_reject_rate['reject_rate'] = ms_reject_rate.reject_times / ms_reject_rate.times
ms_reject_rate
| name_family_status | times | reject_times | reject_rate | |
|---|---|---|---|---|
| 0 | Civil marriage | 2945 | 367 | 0.12 |
| 1 | Married | 25048 | 2914 | 0.12 |
| 2 | Separated | 2103 | 225 | 0.11 |
| 3 | Single / not married | 4829 | 623 | 0.13 |
| 4 | Widow | 1532 | 162 | 0.11 |
plt.subplots(figsize = (15, 8))
sns.barplot(x = "name_family_status", y = "reject_rate", data = ms_reject_rate)
plt.grid(False)
plt.show()
categorical_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | ... | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | income_bucket | age | age_bucket | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | 19128 | ... | N | Security staff | 2.00 | 0 | None | Two | Very_low | 52 | Middle_Age | 3.00 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
5 rows × 23 columns
categorical_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 36457 entries, 0 to 36456 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 code_gender 36457 non-null category 1 flag_own_car 36457 non-null category 2 flag_own_realty 36457 non-null category 3 cnt_children 36457 non-null int64 4 amt_income_total 36457 non-null float64 5 name_income_type 36457 non-null object 6 name_education_type 36457 non-null object 7 name_family_status 36457 non-null object 8 name_housing_type 36457 non-null object 9 days_birth 36457 non-null int64 10 days_employed 36457 non-null int64 11 flag_work_phone 36457 non-null category 12 flag_phone 36457 non-null category 13 flag_email 36457 non-null category 14 occupation_type 25134 non-null object 15 cnt_fam_members 36457 non-null float64 16 status 36457 non-null int64 17 children_cnt_bucket 36457 non-null object 18 cnt_fam_members_bucket 36457 non-null object 19 income_bucket 36457 non-null category 20 age 36457 non-null int32 21 age_bucket 36457 non-null category 22 employed_years 36457 non-null float64 dtypes: category(8), float64(3), int32(1), int64(4), object(7) memory usage: 4.3+ MB
categorical_df.isnull().sum()
code_gender 0 flag_own_car 0 flag_own_realty 0 cnt_children 0 amt_income_total 0 name_income_type 0 name_education_type 0 name_family_status 0 name_housing_type 0 days_birth 0 days_employed 0 flag_work_phone 0 flag_phone 0 flag_email 0 occupation_type 11323 cnt_fam_members 0 status 0 children_cnt_bucket 0 cnt_fam_members_bucket 0 income_bucket 0 age 0 age_bucket 0 employed_years 0 dtype: int64
categorical_df.name_housing_type.nunique()
6
categorical_df.name_housing_type.unique()
array(['House / apartment', 'Office apartment', 'Rented apartment',
'Municipal apartment', 'With parents', 'Co-op apartment'],
dtype=object)
categorical_df.name_housing_type.value_counts()
House / apartment 32548 With parents 1776 Municipal apartment 1128 Rented apartment 575 Office apartment 262 Co-op apartment 168 Name: name_housing_type, dtype: int64
categorical_df.name_housing_type.value_counts(normalize = True)
House / apartment 0.89 With parents 0.05 Municipal apartment 0.03 Rented apartment 0.02 Office apartment 0.01 Co-op apartment 0.00 Name: name_housing_type, dtype: float64
pd.crosstab(categorical_df['name_housing_type'], categorical_df['status'], margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| name_housing_type | |||
| Co-op apartment | 154 | 14 | 168 |
| House / apartment | 28766 | 3782 | 32548 |
| Municipal apartment | 978 | 150 | 1128 |
| Office apartment | 224 | 38 | 262 |
| Rented apartment | 495 | 80 | 575 |
| With parents | 1549 | 227 | 1776 |
| All | 32166 | 4291 | 36457 |
h_total = categorical_df.groupby(by = ['name_housing_type']).size().reset_index(name = 'times')
h_total
| name_housing_type | times | |
|---|---|---|
| 0 | Co-op apartment | 168 |
| 1 | House / apartment | 32548 |
| 2 | Municipal apartment | 1128 |
| 3 | Office apartment | 262 |
| 4 | Rented apartment | 575 |
| 5 | With parents | 1776 |
h_reject = categorical_df[categorical_df.status == 1].groupby(by = ['name_housing_type']).size().reset_index(name = 'reject_times')
h_reject
| name_housing_type | reject_times | |
|---|---|---|
| 0 | Co-op apartment | 14 |
| 1 | House / apartment | 3782 |
| 2 | Municipal apartment | 150 |
| 3 | Office apartment | 38 |
| 4 | Rented apartment | 80 |
| 5 | With parents | 227 |
h_reject_rate = pd.merge(h_total, h_reject, how = 'outer', on = ['name_housing_type']).fillna(0)
h_reject_rate
| name_housing_type | times | reject_times | |
|---|---|---|---|
| 0 | Co-op apartment | 168 | 14 |
| 1 | House / apartment | 32548 | 3782 |
| 2 | Municipal apartment | 1128 | 150 |
| 3 | Office apartment | 262 | 38 |
| 4 | Rented apartment | 575 | 80 |
| 5 | With parents | 1776 | 227 |
h_reject_rate['reject_rate'] = h_reject_rate.reject_times / h_reject_rate.times
h_reject_rate
| name_housing_type | times | reject_times | reject_rate | |
|---|---|---|---|---|
| 0 | Co-op apartment | 168 | 14 | 0.08 |
| 1 | House / apartment | 32548 | 3782 | 0.12 |
| 2 | Municipal apartment | 1128 | 150 | 0.13 |
| 3 | Office apartment | 262 | 38 | 0.15 |
| 4 | Rented apartment | 575 | 80 | 0.14 |
| 5 | With parents | 1776 | 227 | 0.13 |
plt.subplots(figsize = (15, 10))
sns.barplot(y = "name_housing_type", x = "reject_rate", data = h_reject_rate, orient = 'h')
plt.grid(False)
plt.show()
categorical_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | ... | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | income_bucket | age | age_bucket | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | 19128 | ... | N | Security staff | 2.00 | 0 | None | Two | Very_low | 52 | Middle_Age | 3.00 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
5 rows × 23 columns
categorical_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 36457 entries, 0 to 36456 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 code_gender 36457 non-null category 1 flag_own_car 36457 non-null category 2 flag_own_realty 36457 non-null category 3 cnt_children 36457 non-null int64 4 amt_income_total 36457 non-null float64 5 name_income_type 36457 non-null object 6 name_education_type 36457 non-null object 7 name_family_status 36457 non-null object 8 name_housing_type 36457 non-null object 9 days_birth 36457 non-null int64 10 days_employed 36457 non-null int64 11 flag_work_phone 36457 non-null category 12 flag_phone 36457 non-null category 13 flag_email 36457 non-null category 14 occupation_type 25134 non-null object 15 cnt_fam_members 36457 non-null float64 16 status 36457 non-null int64 17 children_cnt_bucket 36457 non-null object 18 cnt_fam_members_bucket 36457 non-null object 19 income_bucket 36457 non-null category 20 age 36457 non-null int32 21 age_bucket 36457 non-null category 22 employed_years 36457 non-null float64 dtypes: category(8), float64(3), int32(1), int64(4), object(7) memory usage: 4.3+ MB
categorical_df.isnull().sum()
code_gender 0 flag_own_car 0 flag_own_realty 0 cnt_children 0 amt_income_total 0 name_income_type 0 name_education_type 0 name_family_status 0 name_housing_type 0 days_birth 0 days_employed 0 flag_work_phone 0 flag_phone 0 flag_email 0 occupation_type 11323 cnt_fam_members 0 status 0 children_cnt_bucket 0 cnt_fam_members_bucket 0 income_bucket 0 age 0 age_bucket 0 employed_years 0 dtype: int64
categorical_df.occupation_type.nunique()
18
categorical_df.occupation_type.unique()
array(['Managers', 'Security staff', nan, 'Cleaning staff', 'Core staff',
'Laborers', 'Cooking staff', 'Medicine staff', 'Accountants',
'Sales staff', 'Low-skill Laborers', 'High skill tech staff',
'Drivers', 'Secretaries', 'Waiters/barmen staff',
'Private service staff', 'Realty agents', 'IT staff', 'HR staff'],
dtype=object)
categorical_df.occupation_type.value_counts()
Laborers 6211 Core staff 3591 Sales staff 3485 Managers 3012 Drivers 2138 High skill tech staff 1383 Accountants 1241 Medicine staff 1207 Cooking staff 655 Security staff 592 Cleaning staff 551 Private service staff 344 Low-skill Laborers 175 Waiters/barmen staff 174 Secretaries 151 HR staff 85 Realty agents 79 IT staff 60 Name: occupation_type, dtype: int64
categorical_df.occupation_type.value_counts().sum()
25134
# Verify the records to fill / replace
pensioner = categorical_df.loc[categorical_df.name_income_type == 'Pensioner'].loc[categorical_df.employed_years == 0] #.groupby(by = ['name_family_status']).size().reset_index(name = 'reject_times')
pensioner.count()
code_gender 6136 flag_own_car 6136 flag_own_realty 6136 cnt_children 6136 amt_income_total 6136 name_income_type 6136 name_education_type 6136 name_family_status 6136 name_housing_type 6136 days_birth 6136 days_employed 6136 flag_work_phone 6136 flag_phone 6136 flag_email 6136 occupation_type 1 cnt_fam_members 6136 status 6136 children_cnt_bucket 6136 cnt_fam_members_bucket 6136 income_bucket 6136 age 6136 age_bucket 6136 employed_years 6136 dtype: int64
pensioner.shape[0]
6136
pensioner.name_income_type.count()
6136
pensioner.employed_years.count()
6136
# By checking multiple conditions
categorical_df['occupation_type'] = np.where((categorical_df['name_income_type'] == 'Pensioner') & (categorical_df['employed_years'] == 0), 'Retired', categorical_df['occupation_type'])
categorical_df['occupation_type'].value_counts()
Laborers 6210 Retired 6136 Core staff 3591 Sales staff 3485 Managers 3012 Drivers 2138 High skill tech staff 1383 Accountants 1241 Medicine staff 1207 Cooking staff 655 Security staff 592 Cleaning staff 551 Private service staff 344 Low-skill Laborers 175 Waiters/barmen staff 174 Secretaries 151 HR staff 85 Realty agents 79 IT staff 60 Name: occupation_type, dtype: int64
categorical_df.isnull().sum()
code_gender 0 flag_own_car 0 flag_own_realty 0 cnt_children 0 amt_income_total 0 name_income_type 0 name_education_type 0 name_family_status 0 name_housing_type 0 days_birth 0 days_employed 0 flag_work_phone 0 flag_phone 0 flag_email 0 occupation_type 5188 cnt_fam_members 0 status 0 children_cnt_bucket 0 cnt_fam_members_bucket 0 income_bucket 0 age 0 age_bucket 0 employed_years 0 dtype: int64
# Impute pending missing values by creating a new category 'Others' in the Occupation column
categorical_df['occupation_type'] = categorical_df['occupation_type'].fillna("Others")
# applications['OCCUPATION_TYPE'].fillna(value='Other', inplace=True)
categorical_df.occupation_type.value_counts()
Laborers 6210 Retired 6136 Others 5188 Core staff 3591 Sales staff 3485 Managers 3012 Drivers 2138 High skill tech staff 1383 Accountants 1241 Medicine staff 1207 Cooking staff 655 Security staff 592 Cleaning staff 551 Private service staff 344 Low-skill Laborers 175 Waiters/barmen staff 174 Secretaries 151 HR staff 85 Realty agents 79 IT staff 60 Name: occupation_type, dtype: int64
categorical_df.occupation_type.value_counts(normalize = True)
Laborers 0.17 Retired 0.17 Others 0.14 Core staff 0.10 Sales staff 0.10 Managers 0.08 Drivers 0.06 High skill tech staff 0.04 Accountants 0.03 Medicine staff 0.03 Cooking staff 0.02 Security staff 0.02 Cleaning staff 0.02 Private service staff 0.01 Low-skill Laborers 0.00 Waiters/barmen staff 0.00 Secretaries 0.00 HR staff 0.00 Realty agents 0.00 IT staff 0.00 Name: occupation_type, dtype: float64
categorical_df.isnull().sum()
code_gender 0 flag_own_car 0 flag_own_realty 0 cnt_children 0 amt_income_total 0 name_income_type 0 name_education_type 0 name_family_status 0 name_housing_type 0 days_birth 0 days_employed 0 flag_work_phone 0 flag_phone 0 flag_email 0 occupation_type 0 cnt_fam_members 0 status 0 children_cnt_bucket 0 cnt_fam_members_bucket 0 income_bucket 0 age 0 age_bucket 0 employed_years 0 dtype: int64
pd.crosstab(categorical_df['occupation_type'], categorical_df['status'], margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| occupation_type | |||
| Accountants | 1094 | 147 | 1241 |
| Cleaning staff | 488 | 63 | 551 |
| Cooking staff | 569 | 86 | 655 |
| Core staff | 3128 | 463 | 3591 |
| Drivers | 1874 | 264 | 2138 |
| HR staff | 71 | 14 | 85 |
| High skill tech staff | 1202 | 181 | 1383 |
| IT staff | 49 | 11 | 60 |
| Laborers | 5481 | 729 | 6210 |
| Low-skill Laborers | 142 | 33 | 175 |
| Managers | 2622 | 390 | 3012 |
| Medicine staff | 1044 | 163 | 1207 |
| Others | 4613 | 575 | 5188 |
| Private service staff | 322 | 22 | 344 |
| Realty agents | 69 | 10 | 79 |
| Retired | 5508 | 628 | 6136 |
| Sales staff | 3096 | 389 | 3485 |
| Secretaries | 138 | 13 | 151 |
| Security staff | 501 | 91 | 592 |
| Waiters/barmen staff | 155 | 19 | 174 |
| All | 32166 | 4291 | 36457 |
occ_total = categorical_df.groupby(by = ['occupation_type']).size().reset_index(name = 'times')
occ_total
| occupation_type | times | |
|---|---|---|
| 0 | Accountants | 1241 |
| 1 | Cleaning staff | 551 |
| 2 | Cooking staff | 655 |
| 3 | Core staff | 3591 |
| 4 | Drivers | 2138 |
| 5 | HR staff | 85 |
| 6 | High skill tech staff | 1383 |
| 7 | IT staff | 60 |
| 8 | Laborers | 6210 |
| 9 | Low-skill Laborers | 175 |
| 10 | Managers | 3012 |
| 11 | Medicine staff | 1207 |
| 12 | Others | 5188 |
| 13 | Private service staff | 344 |
| 14 | Realty agents | 79 |
| 15 | Retired | 6136 |
| 16 | Sales staff | 3485 |
| 17 | Secretaries | 151 |
| 18 | Security staff | 592 |
| 19 | Waiters/barmen staff | 174 |
occ_reject = categorical_df[categorical_df.status == 1].groupby(by = ['occupation_type']).size().reset_index(name = 'reject_times')
occ_reject
| occupation_type | reject_times | |
|---|---|---|
| 0 | Accountants | 147 |
| 1 | Cleaning staff | 63 |
| 2 | Cooking staff | 86 |
| 3 | Core staff | 463 |
| 4 | Drivers | 264 |
| 5 | HR staff | 14 |
| 6 | High skill tech staff | 181 |
| 7 | IT staff | 11 |
| 8 | Laborers | 729 |
| 9 | Low-skill Laborers | 33 |
| 10 | Managers | 390 |
| 11 | Medicine staff | 163 |
| 12 | Others | 575 |
| 13 | Private service staff | 22 |
| 14 | Realty agents | 10 |
| 15 | Retired | 628 |
| 16 | Sales staff | 389 |
| 17 | Secretaries | 13 |
| 18 | Security staff | 91 |
| 19 | Waiters/barmen staff | 19 |
occ_reject_rate = pd.merge(occ_total, occ_reject, how = 'outer', on = ['occupation_type']).fillna(0)
occ_reject_rate
| occupation_type | times | reject_times | |
|---|---|---|---|
| 0 | Accountants | 1241 | 147 |
| 1 | Cleaning staff | 551 | 63 |
| 2 | Cooking staff | 655 | 86 |
| 3 | Core staff | 3591 | 463 |
| 4 | Drivers | 2138 | 264 |
| 5 | HR staff | 85 | 14 |
| 6 | High skill tech staff | 1383 | 181 |
| 7 | IT staff | 60 | 11 |
| 8 | Laborers | 6210 | 729 |
| 9 | Low-skill Laborers | 175 | 33 |
| 10 | Managers | 3012 | 390 |
| 11 | Medicine staff | 1207 | 163 |
| 12 | Others | 5188 | 575 |
| 13 | Private service staff | 344 | 22 |
| 14 | Realty agents | 79 | 10 |
| 15 | Retired | 6136 | 628 |
| 16 | Sales staff | 3485 | 389 |
| 17 | Secretaries | 151 | 13 |
| 18 | Security staff | 592 | 91 |
| 19 | Waiters/barmen staff | 174 | 19 |
occ_reject_rate['reject_rate'] = occ_reject_rate.reject_times / occ_reject_rate.times
occ_reject_rate
| occupation_type | times | reject_times | reject_rate | |
|---|---|---|---|---|
| 0 | Accountants | 1241 | 147 | 0.12 |
| 1 | Cleaning staff | 551 | 63 | 0.11 |
| 2 | Cooking staff | 655 | 86 | 0.13 |
| 3 | Core staff | 3591 | 463 | 0.13 |
| 4 | Drivers | 2138 | 264 | 0.12 |
| 5 | HR staff | 85 | 14 | 0.16 |
| 6 | High skill tech staff | 1383 | 181 | 0.13 |
| 7 | IT staff | 60 | 11 | 0.18 |
| 8 | Laborers | 6210 | 729 | 0.12 |
| 9 | Low-skill Laborers | 175 | 33 | 0.19 |
| 10 | Managers | 3012 | 390 | 0.13 |
| 11 | Medicine staff | 1207 | 163 | 0.14 |
| 12 | Others | 5188 | 575 | 0.11 |
| 13 | Private service staff | 344 | 22 | 0.06 |
| 14 | Realty agents | 79 | 10 | 0.13 |
| 15 | Retired | 6136 | 628 | 0.10 |
| 16 | Sales staff | 3485 | 389 | 0.11 |
| 17 | Secretaries | 151 | 13 | 0.09 |
| 18 | Security staff | 592 | 91 | 0.15 |
| 19 | Waiters/barmen staff | 174 | 19 | 0.11 |
plt.subplots(figsize = (30, 20))
sns.barplot(x = "reject_rate", y = "occupation_type", data = occ_reject_rate, orient = 'h')
plt.grid(False)
plt.show()
categorical_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 36457 entries, 0 to 36456 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 code_gender 36457 non-null category 1 flag_own_car 36457 non-null category 2 flag_own_realty 36457 non-null category 3 cnt_children 36457 non-null int64 4 amt_income_total 36457 non-null float64 5 name_income_type 36457 non-null object 6 name_education_type 36457 non-null object 7 name_family_status 36457 non-null object 8 name_housing_type 36457 non-null object 9 days_birth 36457 non-null int64 10 days_employed 36457 non-null int64 11 flag_work_phone 36457 non-null category 12 flag_phone 36457 non-null category 13 flag_email 36457 non-null category 14 occupation_type 36457 non-null object 15 cnt_fam_members 36457 non-null float64 16 status 36457 non-null int64 17 children_cnt_bucket 36457 non-null object 18 cnt_fam_members_bucket 36457 non-null object 19 income_bucket 36457 non-null category 20 age 36457 non-null int32 21 age_bucket 36457 non-null category 22 employed_years 36457 non-null float64 dtypes: category(8), float64(3), int32(1), int64(4), object(7) memory usage: 4.3+ MB
categorical_df = categorical_df.drop(['days_birth', 'days_employed'], axis = 1)
categorical_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | ... | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | income_bucket | age | age_bucket | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | Y | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | Y | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | Y | ... | N | Security staff | 2.00 | 0 | None | Two | Very_low | 52 | Middle_Age | 3.00 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | N | ... | N | Retired | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | N | ... | N | Retired | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
5 rows × 21 columns
categorical_df.shape
(36457, 21)
# Drop the more variables:-
categorical_df = categorical_df.drop(['cnt_children', 'cnt_fam_members', 'income_bucket', 'age_bucket'], axis = 1)
categorical_df.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | flag_email | occupation_type | status | children_cnt_bucket | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 27000.00 | Working | Higher education | Civil marriage | House / apartment | Y | N | N | Managers | 1 | None | Two | 40 | 13.00 |
| 1 | F | N | N | 27000.00 | Working | Higher education | Civil marriage | House / apartment | Y | N | N | Managers | 1 | None | Two | 40 | 13.00 |
| 2 | F | N | Y | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | Y | N | N | Security staff | 0 | None | Two | 52 | 3.00 |
| 3 | F | Y | Y | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | N | N | N | Retired | 0 | None | Two | 54 | 0.00 |
| 4 | F | Y | Y | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | N | N | N | Retired | 0 | None | Two | 54 | 0.00 |
categorical_df.shape
(36457, 17)
cleaned_df = categorical_df.copy()
cleaned_df.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | flag_email | occupation_type | status | children_cnt_bucket | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 27000.00 | Working | Higher education | Civil marriage | House / apartment | Y | N | N | Managers | 1 | None | Two | 40 | 13.00 |
| 1 | F | N | N | 27000.00 | Working | Higher education | Civil marriage | House / apartment | Y | N | N | Managers | 1 | None | Two | 40 | 13.00 |
| 2 | F | N | Y | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | Y | N | N | Security staff | 0 | None | Two | 52 | 3.00 |
| 3 | F | Y | Y | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | N | N | N | Retired | 0 | None | Two | 54 | 0.00 |
| 4 | F | Y | Y | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | N | N | N | Retired | 0 | None | Two | 54 | 0.00 |
# Save the cleaned EDA dataset
cleaned_df.to_csv('D:\\D - Drive\\IPBA\\BYOP\\Capstone Project\\Final - Credit Card Approval Model\\cleaned_df_final.csv', index = False)
continuous_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | ... | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | income_bucket | age | age_bucket | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | 19128 | ... | N | Security staff | 2.00 | 0 | None | Two | Very_low | 52 | Middle_Age | 3.00 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
5 rows × 23 columns
plt.rcParams.update({'figure.figsize': (12.0, 8.0)})
# Distribution Plot
plt.subplots(figsize = (12, 8))
sns.distplot(continuous_df['cnt_children'], hist = True, kde = True)
plt.title('Histogram-cum-Density Plot of Children Count\n')
plt.xlabel('\nTotal Children')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
# Distribution Plot
plt.subplots(figsize = (12, 8))
sns.distplot(continuous_df['amt_income_total'], hist = True, kde = True)
plt.title('Histogram-cum-Density Plot of Total Income\n')
plt.xlabel('\nTotal Income')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
# Distribution Plot
plt.subplots(figsize = (12, 8))
sns.distplot(continuous_df['cnt_fam_members'], hist = True, kde = True)
plt.title('Histogram-cum-Density Plot of Family Members Count\n')
plt.xlabel('\nTotal Family Members')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
# Distribution Plot
plt.subplots(figsize = (12, 8))
sns.distplot(continuous_df['age'], hist = True, kde = True)
plt.title('Histogram-cum-Density Plot of Age\n')
plt.xlabel('\nAge')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
# Distribution Plot
plt.subplots(figsize = (12, 8))
sns.distplot(continuous_df['employed_years'], hist = True, kde = True)
plt.title('Histogram-cum-Density Plot of Years of Current Employment\n')
plt.xlabel('\nYears of Current Employment')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
# Distribution Plot
plt.subplots(figsize = (12, 8))
sns.distplot(continuous_df['status'], hist = True, kde = True)
plt.title('Histogram-cum-Density Plot of Eligibility\n')
plt.xlabel('\nEligible Vs Non-Eligible')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
categorical_df.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | flag_email | occupation_type | status | children_cnt_bucket | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 27000.00 | Working | Higher education | Civil marriage | House / apartment | Y | N | N | Managers | 1 | None | Two | 40 | 13.00 |
| 1 | F | N | N | 27000.00 | Working | Higher education | Civil marriage | House / apartment | Y | N | N | Managers | 1 | None | Two | 40 | 13.00 |
| 2 | F | N | Y | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | Y | N | N | Security staff | 0 | None | Two | 52 | 3.00 |
| 3 | F | Y | Y | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | N | N | N | Retired | 0 | None | Two | 54 | 0.00 |
| 4 | F | Y | Y | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | N | N | N | Retired | 0 | None | Two | 54 | 0.00 |
# Bar Plot
categorical_df['code_gender'].value_counts(normalize = True).sort_index().plot.bar()
plt.title('Bar Plot of Gender Distribution\n')
plt.xlabel('\nGender')
plt.ylabel('Percentage\n')
plt.xticks(rotation = 0)
plt.grid(False)
# Bar Plot
categorical_df['flag_own_car'].value_counts(normalize = True).sort_index().plot.bar()
plt.title('Bar Plot of Cars Ownership\n')
plt.xlabel('\nCar')
plt.ylabel('Percentage\n')
plt.xticks(rotation = 0)
plt.grid(False)
# Bar Plot
categorical_df['flag_own_realty'].value_counts(normalize = True).sort_index().plot.bar()
plt.title('Bar Plot of Property Ownership\n')
plt.xlabel('\nProperty')
plt.ylabel('Percentage\n')
plt.xticks(rotation = 0)
plt.grid(False)
# Bar Plot
plt.subplots(figsize = (16, 8))
categorical_df['name_income_type'].value_counts(normalize = True).sort_values(ascending=False).plot.bar()
plt.title('Bar Plot of Income Type\n')
plt.xlabel('\nIncome Type')
plt.ylabel('Percentage\n')
plt.xticks(rotation = 0)
plt.grid(False)
# Bar Plot
plt.subplots(figsize = (16, 8))
categorical_df['name_education_type'].value_counts(normalize = True).sort_values(ascending=False).plot.bar()
plt.title('Bar Plot of Education Type\n')
plt.xlabel('\nEducation Type')
plt.ylabel('Percentage\n')
plt.xticks(rotation = 0, fontsize = 16)
plt.yticks(rotation = 0, fontsize = 16)
plt.grid(False)
# Bar Plot
plt.subplots(figsize = (16, 8))
categorical_df['name_family_status'].value_counts(normalize = True).sort_values(ascending=False).plot.bar()
plt.title('Bar Plot of Marital Status\n')
plt.xlabel('\nMarital Type')
plt.ylabel('Percentage\n')
plt.xticks(rotation = 0)
plt.grid(False)
# Bar Plot
plt.subplots(figsize = (16, 8))
categorical_df['name_housing_type'].value_counts(normalize = True).sort_values(ascending=False).plot.bar()
plt.title('Bar Plot of Housing Type\n')
plt.xlabel('\nHousing Type')
plt.ylabel('Percentage\n')
plt.xticks(rotation = 0)
plt.xticks(rotation = 0, fontsize = 16)
plt.yticks(rotation = 0, fontsize = 16)
plt.grid(False)
# Bar Plot
plt.subplots(figsize = (16, 8))
categorical_df['occupation_type'].value_counts(normalize = True).sort_values(ascending=True).plot.barh()
plt.title('Bar Plot of Occupation Type\n')
plt.xlabel('\nOccupation Type')
plt.ylabel('Percentage\n')
plt.xticks(rotation = 0)
plt.xticks(rotation = 0, fontsize = 16)
plt.yticks(rotation = 0, fontsize = 16)
plt.grid(False)
# Bar Plot
#plt.subplots(figsize = (16, 8))
categorical_df['flag_work_phone'].value_counts(normalize = True).sort_index().plot.bar()
plt.title('Bar Plot of Work Phone\n')
plt.xlabel('\nWork Phone')
plt.ylabel('Percentage\n')
plt.xticks(rotation = 0)
plt.grid(False)
# Bar Plot
#plt.subplots(figsize = (16, 8))
categorical_df['flag_phone'].value_counts(normalize = True).sort_index().plot.bar()
plt.title('Bar Plot of Phone\n')
plt.xlabel('\nPhone')
plt.ylabel('Percentage\n')
plt.xticks(rotation = 0)
plt.grid(False)
# Bar Plot
#plt.subplots(figsize = (16, 8))
categorical_df['flag_email'].value_counts(normalize = True).sort_index().plot.bar()
plt.title('Bar Plot of Email\n')
plt.xlabel('\nEmail')
plt.ylabel('Percentage\n')
plt.xticks(rotation = 0)
plt.grid(False)
# Bar Plot
plt.subplots(figsize = (16, 8))
categorical_df['children_cnt_bucket'].value_counts(normalize = True).sort_values(ascending=False).plot.bar()
plt.title('Bar Plot of Children Count Bucket\n')
plt.xlabel('\nChildren Count Bucket')
plt.ylabel('Percentage\n')
plt.xticks(rotation = 0)
plt.grid(False)
# Bar Plot
plt.subplots(figsize = (16, 8))
categorical_df['cnt_fam_members_bucket'].value_counts(normalize = True).sort_values(ascending=False).plot.bar()
plt.title('Bar Plot of Family Members Count Bucket\n')
plt.xlabel('\nFamily Members Count Bucket')
plt.ylabel('Percentage\n')
plt.xticks(rotation = 0)
plt.xticks(rotation = 0, fontsize = 16)
plt.yticks(rotation = 0, fontsize = 16)
plt.grid(False)
# Bar Plot
plt.subplots(figsize = (16, 8))
categorical_df['status'].value_counts(normalize = True).sort_values(ascending=False).plot.bar()
plt.title('Bar Plot of Eligibility\n')
plt.xlabel('\nEligible vs Non-Eligible')
plt.ylabel('Percentage\n')
plt.xticks(rotation = 0)
plt.xticks(rotation = 0, fontsize = 16)
plt.yticks(rotation = 0, fontsize = 16)
plt.grid(False)
continuous_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | ... | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | income_bucket | age | age_bucket | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | 19128 | ... | N | Security staff | 2.00 | 0 | None | Two | Very_low | 52 | Middle_Age | 3.00 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
5 rows × 23 columns
# Correlation
continuous_df[['cnt_children', 'amt_income_total', 'cnt_fam_members', 'age', 'employed_years', 'status']].corr()
| cnt_children | amt_income_total | cnt_fam_members | age | employed_years | status | |
|---|---|---|---|---|---|---|
| cnt_children | 1.00 | 0.03 | 0.89 | -0.34 | 0.04 | 0.01 |
| amt_income_total | 0.03 | 1.00 | 0.02 | -0.07 | 0.08 | 0.02 |
| cnt_fam_members | 0.89 | 0.02 | 1.00 | -0.30 | 0.05 | 0.01 |
| age | -0.34 | -0.07 | -0.30 | 1.00 | -0.01 | -0.03 |
| employed_years | 0.04 | 0.08 | 0.05 | -0.01 | 1.00 | 0.00 |
| status | 0.01 | 0.02 | 0.01 | -0.03 | 0.00 | 1.00 |
# Scatter plot to view the correlation pattern
sns.scatterplot(continuous_df.cnt_children, continuous_df.cnt_fam_members)
#plt.ylim(0,25)
plt.grid(False)
plt.show()
# KDE Plot
plt.subplots(figsize = (12, 8))
sns.kdeplot(data = continuous_df, x = 'cnt_children', hue = 'status', fill = True)
plt.title('KDE Plot of Children Count with Eligibility\n')
plt.xlabel('\nChildren Count')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
# KDE Plot
plt.subplots(figsize = (12, 8))
sns.kdeplot(data = continuous_df, x = 'amt_income_total', hue = 'status', fill = True)
plt.title('KDE Plot of Total Income with Eligibility\n')
plt.xlabel('\nTotal Income')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
# KDE Plot
plt.subplots(figsize = (12, 8))
sns.kdeplot(data = continuous_df, x = 'cnt_fam_members', hue = 'status', fill = True)
plt.title('KDE Plot of Family Members Count with Eligibility\n')
plt.xlabel('\nFamily Members Count')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
# KDE Plot
plt.subplots(figsize = (12, 8))
sns.kdeplot(data = continuous_df, x = 'age', hue = 'status', fill = True)
plt.title('KDE Plot of Age with Eligibility\n')
plt.xlabel('\nAge')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
# KDE Plot
plt.subplots(figsize = (12, 8))
sns.kdeplot(data = continuous_df, x = 'employed_years', hue = 'status', fill = True)
plt.title('KDE Plot of Years of Current Employment with Eligibility\n')
plt.xlabel('\nFamily Members Count')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
categorical_df.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | flag_email | occupation_type | status | children_cnt_bucket | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 27000.00 | Working | Higher education | Civil marriage | House / apartment | Y | N | N | Managers | 1 | None | Two | 40 | 13.00 |
| 1 | F | N | N | 27000.00 | Working | Higher education | Civil marriage | House / apartment | Y | N | N | Managers | 1 | None | Two | 40 | 13.00 |
| 2 | F | N | Y | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | Y | N | N | Security staff | 0 | None | Two | 52 | 3.00 |
| 3 | F | Y | Y | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | N | N | N | Retired | 0 | None | Two | 54 | 0.00 |
| 4 | F | Y | Y | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | N | N | N | Retired | 0 | None | Two | 54 | 0.00 |
# Group-by
continuous_df.groupby(by = 'status').agg('mean')[['cnt_children', 'amt_income_total', 'cnt_fam_members', 'age', 'employed_years']]
| cnt_children | amt_income_total | cnt_fam_members | age | employed_years | |
|---|---|---|---|---|---|
| status | |||||
| 0 | 0.43 | 185785.99 | 2.20 | 43.35 | 5.60 |
| 1 | 0.45 | 193430.41 | 2.22 | 42.31 | 5.69 |
# Count Plot
plt.subplots(figsize = (12, 8))
sns.countplot(data = categorical_df, x = 'status', hue = 'code_gender')
plt.title('Count Plot of Gender with Eligibility\n')
plt.xlabel('\nEligibile vs Non-Eligible')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
pd.crosstab(categorical_df.code_gender, categorical_df.status, margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| code_gender | |||
| F | 21672 | 2758 | 24430 |
| M | 10494 | 1533 | 12027 |
| All | 32166 | 4291 | 36457 |
all = pd.crosstab(categorical_df.code_gender, categorical_df.status, margins = True)['All']
pd.crosstab(categorical_df.code_gender, categorical_df.status, margins = True).divide(all, axis = 0).dropna()
| status | 0 | 1 | All |
|---|---|---|---|
| code_gender | |||
| F | 0.89 | 0.11 | 1.00 |
| M | 0.87 | 0.13 | 1.00 |
| All | 0.88 | 0.12 | 1.00 |
# Count Plot
plt.subplots(figsize = (12, 8))
sns.countplot(data = categorical_df, x = 'status', hue = 'flag_own_car')
plt.title('Count Plot of Cars with Eligibility\n')
plt.xlabel('\nEligibile vs Non-Eligible')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
pd.crosstab(categorical_df.flag_own_car, categorical_df.status, margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| flag_own_car | |||
| N | 19892 | 2722 | 22614 |
| Y | 12274 | 1569 | 13843 |
| All | 32166 | 4291 | 36457 |
all = pd.crosstab(categorical_df.flag_own_car, categorical_df.status, margins = True)['All']
pd.crosstab(categorical_df.flag_own_car, categorical_df.status, margins = True).divide(all, axis = 0).dropna()
| status | 0 | 1 | All |
|---|---|---|---|
| flag_own_car | |||
| N | 0.88 | 0.12 | 1.00 |
| Y | 0.89 | 0.11 | 1.00 |
| All | 0.88 | 0.12 | 1.00 |
# Count Plot
plt.subplots(figsize = (12, 8))
sns.countplot(data = categorical_df, x = 'status', hue = 'flag_own_realty')
plt.title('Count Plot of Property with Eligibility\n')
plt.xlabel('\nEligibile vs Non-Eligible')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
pd.crosstab(categorical_df.flag_own_realty, categorical_df.status, margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| flag_own_realty | |||
| N | 10390 | 1561 | 11951 |
| Y | 21776 | 2730 | 24506 |
| All | 32166 | 4291 | 36457 |
all = pd.crosstab(categorical_df.flag_own_realty, categorical_df.status, margins = True)['All']
pd.crosstab(categorical_df.flag_own_realty, categorical_df.status, margins = True).divide(all, axis = 0).dropna()
| status | 0 | 1 | All |
|---|---|---|---|
| flag_own_realty | |||
| N | 0.87 | 0.13 | 1.00 |
| Y | 0.89 | 0.11 | 1.00 |
| All | 0.88 | 0.12 | 1.00 |
# Count Plot
plt.subplots(figsize = (12, 8))
sns.countplot(data = categorical_df, x = 'status', hue = 'name_income_type')
plt.title('Count Plot of Income Type with Eligibility\n')
plt.xlabel('\nEligibile vs Non-Eligible')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
pd.crosstab(categorical_df.name_income_type, categorical_df.status, margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| name_income_type | |||
| Commercial associate | 7410 | 1080 | 8490 |
| Pensioner | 5508 | 644 | 6152 |
| State servant | 2600 | 385 | 2985 |
| Student | 10 | 1 | 11 |
| Working | 16638 | 2181 | 18819 |
| All | 32166 | 4291 | 36457 |
all = pd.crosstab(categorical_df.name_income_type, categorical_df.status, margins = True)['All']
pd.crosstab(categorical_df.name_income_type, categorical_df.status, margins = True).divide(all, axis = 0).dropna()
| status | 0 | 1 | All |
|---|---|---|---|
| name_income_type | |||
| Commercial associate | 0.87 | 0.13 | 1.00 |
| Pensioner | 0.90 | 0.10 | 1.00 |
| State servant | 0.87 | 0.13 | 1.00 |
| Student | 0.91 | 0.09 | 1.00 |
| Working | 0.88 | 0.12 | 1.00 |
| All | 0.88 | 0.12 | 1.00 |
# Count Plot
plt.subplots(figsize = (12, 8))
sns.countplot(data = categorical_df, x = 'status', hue = 'name_education_type')
plt.title('Count Plot of Education Type with Eligibility\n')
plt.xlabel('\nEligibile vs Non-Eligible')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
pd.crosstab(categorical_df.name_education_type, categorical_df.status, margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| name_education_type | |||
| Academic degree | 25 | 7 | 32 |
| Higher education | 8716 | 1148 | 9864 |
| Incomplete higher | 1203 | 207 | 1410 |
| Lower secondary | 335 | 39 | 374 |
| Secondary / secondary special | 21887 | 2890 | 24777 |
| All | 32166 | 4291 | 36457 |
all = pd.crosstab(categorical_df.name_education_type, categorical_df.status, margins = True)['All']
pd.crosstab(categorical_df.name_education_type, categorical_df.status, margins = True).divide(all, axis = 0).dropna()
| status | 0 | 1 | All |
|---|---|---|---|
| name_education_type | |||
| Academic degree | 0.78 | 0.22 | 1.00 |
| Higher education | 0.88 | 0.12 | 1.00 |
| Incomplete higher | 0.85 | 0.15 | 1.00 |
| Lower secondary | 0.90 | 0.10 | 1.00 |
| Secondary / secondary special | 0.88 | 0.12 | 1.00 |
| All | 0.88 | 0.12 | 1.00 |
# Group-by
continuous_df.groupby(by = 'status').agg('mean')[['cnt_children', 'amt_income_total', 'cnt_fam_members', 'age', 'employed_years']]
| cnt_children | amt_income_total | cnt_fam_members | age | employed_years | |
|---|---|---|---|---|---|
| status | |||||
| 0 | 0.43 | 185785.99 | 2.20 | 43.35 | 5.60 |
| 1 | 0.45 | 193430.41 | 2.22 | 42.31 | 5.69 |
# Now filter rows by Academic degree
continuous_df[continuous_df.name_education_type == 'Academic degree'][['cnt_children', 'amt_income_total', 'cnt_fam_members', 'age', 'employed_years']].agg('mean')
cnt_children 0.72 amt_income_total 247500.00 cnt_fam_members 2.47 age 42.19 employed_years 4.69 dtype: float64
# Now filter rows by Lower secondary
continuous_df[continuous_df.name_education_type == 'Lower secondary'][['cnt_children', 'amt_income_total', 'cnt_fam_members', 'age', 'employed_years']].agg('mean')
cnt_children 0.32 amt_income_total 141659.76 cnt_fam_members 2.13 age 47.83 employed_years 3.07 dtype: float64
From the above observation it is not clear that why the applicant with an Academic degree education has a higher rejection rate than the applicant with a lower secondary education.
Almost on all the counts the applicant with an Academic degree education served better than the applicant with a lower secondary education.
# Count Plot
plt.subplots(figsize = (12, 8))
sns.countplot(data = categorical_df, x = 'status', hue = 'name_family_status')
plt.title('Count Plot of Marital Status with Eligibility\n')
plt.xlabel('\nEligibile vs Non-Eligible')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
pd.crosstab(categorical_df.name_family_status, categorical_df.status, margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| name_family_status | |||
| Civil marriage | 2578 | 367 | 2945 |
| Married | 22134 | 2914 | 25048 |
| Separated | 1878 | 225 | 2103 |
| Single / not married | 4206 | 623 | 4829 |
| Widow | 1370 | 162 | 1532 |
| All | 32166 | 4291 | 36457 |
all = pd.crosstab(categorical_df.name_family_status, categorical_df.status, margins = True)['All']
pd.crosstab(categorical_df.name_family_status, categorical_df.status, margins = True).divide(all, axis = 0).dropna()
| status | 0 | 1 | All |
|---|---|---|---|
| name_family_status | |||
| Civil marriage | 0.88 | 0.12 | 1.00 |
| Married | 0.88 | 0.12 | 1.00 |
| Separated | 0.89 | 0.11 | 1.00 |
| Single / not married | 0.87 | 0.13 | 1.00 |
| Widow | 0.89 | 0.11 | 1.00 |
| All | 0.88 | 0.12 | 1.00 |
# Count Plot
plt.subplots(figsize = (12, 8))
sns.countplot(data = categorical_df, x = 'status', hue = 'name_housing_type')
plt.title('Count Plot of Housing Type with Eligibility\n')
plt.xlabel('\nEligibile vs Non-Eligible')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
pd.crosstab(categorical_df.name_housing_type, categorical_df.status, margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| name_housing_type | |||
| Co-op apartment | 154 | 14 | 168 |
| House / apartment | 28766 | 3782 | 32548 |
| Municipal apartment | 978 | 150 | 1128 |
| Office apartment | 224 | 38 | 262 |
| Rented apartment | 495 | 80 | 575 |
| With parents | 1549 | 227 | 1776 |
| All | 32166 | 4291 | 36457 |
all = pd.crosstab(categorical_df.name_housing_type, categorical_df.status, margins = True)['All']
pd.crosstab(categorical_df.name_housing_type, categorical_df.status, margins = True).divide(all, axis = 0).dropna()
| status | 0 | 1 | All |
|---|---|---|---|
| name_housing_type | |||
| Co-op apartment | 0.92 | 0.08 | 1.00 |
| House / apartment | 0.88 | 0.12 | 1.00 |
| Municipal apartment | 0.87 | 0.13 | 1.00 |
| Office apartment | 0.85 | 0.15 | 1.00 |
| Rented apartment | 0.86 | 0.14 | 1.00 |
| With parents | 0.87 | 0.13 | 1.00 |
| All | 0.88 | 0.12 | 1.00 |
# Count Plot
plt.subplots(figsize = (12, 8))
sns.countplot(data = categorical_df, x = 'status', hue = 'flag_work_phone')
plt.title('Count Plot of Work Phone with Eligibility\n')
plt.xlabel('\nEligibile vs Non-Eligible')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
pd.crosstab(categorical_df.flag_work_phone, categorical_df.status, margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| flag_work_phone | |||
| N | 24911 | 3324 | 28235 |
| Y | 7255 | 967 | 8222 |
| All | 32166 | 4291 | 36457 |
all = pd.crosstab(categorical_df.flag_work_phone, categorical_df.status, margins = True)['All']
pd.crosstab(categorical_df.flag_work_phone, categorical_df.status, margins = True).divide(all, axis = 0).dropna()
| status | 0 | 1 | All |
|---|---|---|---|
| flag_work_phone | |||
| N | 0.88 | 0.12 | 1.00 |
| Y | 0.88 | 0.12 | 1.00 |
| All | 0.88 | 0.12 | 1.00 |
# Count Plot
plt.subplots(figsize = (12, 8))
sns.countplot(data = categorical_df, x = 'status', hue = 'flag_phone')
plt.title('Count Plot of Phone with Eligibility\n')
plt.xlabel('\nEligibile vs Non-Eligible')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
pd.crosstab(categorical_df.flag_phone, categorical_df.status, margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| flag_phone | |||
| N | 22649 | 3060 | 25709 |
| Y | 9517 | 1231 | 10748 |
| All | 32166 | 4291 | 36457 |
all = pd.crosstab(categorical_df.flag_phone, categorical_df.status, margins = True)['All']
pd.crosstab(categorical_df.flag_phone, categorical_df.status, margins = True).divide(all, axis = 0).dropna()
| status | 0 | 1 | All |
|---|---|---|---|
| flag_phone | |||
| N | 0.88 | 0.12 | 1.00 |
| Y | 0.89 | 0.11 | 1.00 |
| All | 0.88 | 0.12 | 1.00 |
# Count Plot
plt.subplots(figsize = (12, 8))
sns.countplot(data = categorical_df, x = 'status', hue = 'flag_email')
plt.title('Count Plot of Email with Eligibility\n')
plt.xlabel('\nEligibile vs Non-Eligible')
plt.ylabel('Percentage\n')
plt.grid(False)
plt.show()
pd.crosstab(categorical_df.flag_email, categorical_df.status, margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| flag_email | |||
| N | 29346 | 3840 | 33186 |
| Y | 2820 | 451 | 3271 |
| All | 32166 | 4291 | 36457 |
all = pd.crosstab(categorical_df.flag_email, categorical_df.status, margins = True)['All']
pd.crosstab(categorical_df.flag_email, categorical_df.status, margins = True).divide(all, axis = 0).dropna()
| status | 0 | 1 | All |
|---|---|---|---|
| flag_email | |||
| N | 0.88 | 0.12 | 1.00 |
| Y | 0.86 | 0.14 | 1.00 |
| All | 0.88 | 0.12 | 1.00 |
# Count Plot
plt.subplots(figsize = (14, 8))
sns.countplot(data = categorical_df, x = 'status', hue = 'occupation_type')
plt.title('Count Plot of Occupation Type with Eligibility\n')
plt.xlabel('\nEligibile vs Non-Eligible')
plt.ylabel('Percentage\n')
plt.legend(bbox_to_anchor = (1.01, 1), loc = 'upper left', borderaxespad = 0, fontsize = 12,
title = "Occupation Type", title_fontsize = 14)
plt.grid(False)
plt.show()
pd.crosstab(categorical_df.occupation_type, categorical_df.status, margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| occupation_type | |||
| Accountants | 1094 | 147 | 1241 |
| Cleaning staff | 488 | 63 | 551 |
| Cooking staff | 569 | 86 | 655 |
| Core staff | 3128 | 463 | 3591 |
| Drivers | 1874 | 264 | 2138 |
| HR staff | 71 | 14 | 85 |
| High skill tech staff | 1202 | 181 | 1383 |
| IT staff | 49 | 11 | 60 |
| Laborers | 5481 | 729 | 6210 |
| Low-skill Laborers | 142 | 33 | 175 |
| Managers | 2622 | 390 | 3012 |
| Medicine staff | 1044 | 163 | 1207 |
| Others | 4613 | 575 | 5188 |
| Private service staff | 322 | 22 | 344 |
| Realty agents | 69 | 10 | 79 |
| Retired | 5508 | 628 | 6136 |
| Sales staff | 3096 | 389 | 3485 |
| Secretaries | 138 | 13 | 151 |
| Security staff | 501 | 91 | 592 |
| Waiters/barmen staff | 155 | 19 | 174 |
| All | 32166 | 4291 | 36457 |
all = pd.crosstab(categorical_df.occupation_type, categorical_df.status, margins = True)['All']
pd.crosstab(categorical_df.occupation_type, categorical_df.status, margins = True).divide(all, axis = 0).dropna()
| status | 0 | 1 | All |
|---|---|---|---|
| occupation_type | |||
| Accountants | 0.88 | 0.12 | 1.00 |
| Cleaning staff | 0.89 | 0.11 | 1.00 |
| Cooking staff | 0.87 | 0.13 | 1.00 |
| Core staff | 0.87 | 0.13 | 1.00 |
| Drivers | 0.88 | 0.12 | 1.00 |
| HR staff | 0.84 | 0.16 | 1.00 |
| High skill tech staff | 0.87 | 0.13 | 1.00 |
| IT staff | 0.82 | 0.18 | 1.00 |
| Laborers | 0.88 | 0.12 | 1.00 |
| Low-skill Laborers | 0.81 | 0.19 | 1.00 |
| Managers | 0.87 | 0.13 | 1.00 |
| Medicine staff | 0.86 | 0.14 | 1.00 |
| Others | 0.89 | 0.11 | 1.00 |
| Private service staff | 0.94 | 0.06 | 1.00 |
| Realty agents | 0.87 | 0.13 | 1.00 |
| Retired | 0.90 | 0.10 | 1.00 |
| Sales staff | 0.89 | 0.11 | 1.00 |
| Secretaries | 0.91 | 0.09 | 1.00 |
| Security staff | 0.85 | 0.15 | 1.00 |
| Waiters/barmen staff | 0.89 | 0.11 | 1.00 |
| All | 0.88 | 0.12 | 1.00 |
# Group-by
continuous_df.groupby(by = 'status').agg('mean')[['cnt_children', 'amt_income_total', 'cnt_fam_members', 'age', 'employed_years']]
| cnt_children | amt_income_total | cnt_fam_members | age | employed_years | |
|---|---|---|---|---|---|
| status | |||||
| 0 | 0.43 | 185785.99 | 2.20 | 43.35 | 5.60 |
| 1 | 0.45 | 193430.41 | 2.22 | 42.31 | 5.69 |
# Now filter rows by Low-skill Laborers
continuous_df[continuous_df.occupation_type == 'Low-skill Laborers'][['cnt_children', 'amt_income_total', 'cnt_fam_members', 'age', 'employed_years']].agg('mean')
cnt_children 0.44 amt_income_total 133920.00 cnt_fam_members 2.34 age 40.94 employed_years 5.51 dtype: float64
# Now filter rows by IT staff
continuous_df[continuous_df.occupation_type == 'IT staff'][['cnt_children', 'amt_income_total', 'cnt_fam_members', 'age', 'employed_years']].agg('mean')
cnt_children 0.37 amt_income_total 199860.00 cnt_fam_members 2.12 age 34.67 employed_years 3.65 dtype: float64
# Now filter rows by HR staff
continuous_df[continuous_df.occupation_type == 'HR staff'][['cnt_children', 'amt_income_total', 'cnt_fam_members', 'age', 'employed_years']].agg('mean')
cnt_children 0.45 amt_income_total 193764.71 cnt_fam_members 2.00 age 38.14 employed_years 4.84 dtype: float64
# Now filter rows by Security staff
continuous_df[continuous_df.occupation_type == 'Security staff'][['cnt_children', 'amt_income_total', 'cnt_fam_members', 'age', 'employed_years']].agg('mean')
cnt_children 0.40 amt_income_total 177037.75 cnt_fam_members 2.19 age 45.30 employed_years 4.55 dtype: float64
# Now filter rows by Medicine staff
continuous_df[continuous_df.occupation_type == 'Medicine staff'][['cnt_children', 'amt_income_total', 'cnt_fam_members', 'age', 'employed_years']].agg('mean')
cnt_children 0.44 amt_income_total 166114.62 cnt_fam_members 2.22 age 42.96 employed_years 11.11 dtype: float64
# Count Plot
plt.subplots(figsize = (12, 8))
sns.countplot(data = categorical_df, x = 'status', hue = 'children_cnt_bucket')
plt.title('Count Plot of Children Count Bucket with Eligibility\n')
plt.xlabel('\nEligibile vs Non-Eligible')
plt.ylabel('Percentage\n')
plt.legend(bbox_to_anchor = (1.01, 1), loc = 'upper left', borderaxespad = 0, fontsize = 12,
title = "Occupation Type", title_fontsize = 14)
plt.grid(False)
plt.show()
pd.crosstab(categorical_df.children_cnt_bucket, categorical_df.status, margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| children_cnt_bucket | |||
| Five | 20 | 0 | 20 |
| Four | 58 | 5 | 63 |
| More than Five | 2 | 4 | 6 |
| None | 22259 | 2942 | 25201 |
| One | 6642 | 850 | 7492 |
| Three | 364 | 55 | 419 |
| Two | 2821 | 435 | 3256 |
| All | 32166 | 4291 | 36457 |
all = pd.crosstab(categorical_df.children_cnt_bucket, categorical_df.status, margins = True)['All']
pd.crosstab(categorical_df.children_cnt_bucket, categorical_df.status, margins = True).divide(all, axis = 0).dropna()
| status | 0 | 1 | All |
|---|---|---|---|
| children_cnt_bucket | |||
| Five | 1.00 | 0.00 | 1.00 |
| Four | 0.92 | 0.08 | 1.00 |
| More than Five | 0.33 | 0.67 | 1.00 |
| None | 0.88 | 0.12 | 1.00 |
| One | 0.89 | 0.11 | 1.00 |
| Three | 0.87 | 0.13 | 1.00 |
| Two | 0.87 | 0.13 | 1.00 |
| All | 0.88 | 0.12 | 1.00 |
# Group-by
continuous_df.groupby(by = 'status').agg('mean')[['cnt_children', 'amt_income_total', 'cnt_fam_members', 'age', 'employed_years']]
| cnt_children | amt_income_total | cnt_fam_members | age | employed_years | |
|---|---|---|---|---|---|
| status | |||||
| 0 | 0.43 | 185785.99 | 2.20 | 43.35 | 5.60 |
| 1 | 0.45 | 193430.41 | 2.22 | 42.31 | 5.69 |
# Now filter rows by More than Five
continuous_df[continuous_df.children_cnt_bucket == 'More than Five'][['cnt_children', 'amt_income_total', 'cnt_fam_members', 'age', 'employed_years']].agg('mean')
cnt_children 12.50 amt_income_total 183750.00 cnt_fam_members 13.83 age 41.33 employed_years 4.17 dtype: float64
# Count Plot
plt.subplots(figsize = (12, 8))
sns.countplot(data = categorical_df, x = 'status', hue = 'cnt_fam_members_bucket')
plt.title('Count Plot of Family Members Count Bucket with Eligibility\n')
plt.xlabel('\nEligibile vs Non-Eligible')
plt.ylabel('Percentage\n')
plt.legend(bbox_to_anchor = (1.01, 1), loc = 'upper left', borderaxespad = 0, fontsize = 12,
title = "Occupation Type", title_fontsize = 14)
plt.grid(False)
plt.show()
pd.crosstab(categorical_df.cnt_fam_members_bucket, categorical_df.status, margins = True)
| status | 0 | 1 | All |
|---|---|---|---|
| cnt_fam_members_bucket | |||
| Five | 349 | 48 | 397 |
| Four | 2683 | 423 | 3106 |
| More than Seven | 2 | 4 | 6 |
| One | 6162 | 825 | 6987 |
| Seven | 19 | 0 | 19 |
| Six | 53 | 5 | 58 |
| Three | 5698 | 723 | 6421 |
| Two | 17200 | 2263 | 19463 |
| All | 32166 | 4291 | 36457 |
all = pd.crosstab(categorical_df.cnt_fam_members_bucket, categorical_df.status, margins = True)['All']
pd.crosstab(categorical_df.cnt_fam_members_bucket, categorical_df.status, margins = True).divide(all, axis = 0).dropna()
| status | 0 | 1 | All |
|---|---|---|---|
| cnt_fam_members_bucket | |||
| Five | 0.88 | 0.12 | 1.00 |
| Four | 0.86 | 0.14 | 1.00 |
| More than Seven | 0.33 | 0.67 | 1.00 |
| One | 0.88 | 0.12 | 1.00 |
| Seven | 1.00 | 0.00 | 1.00 |
| Six | 0.91 | 0.09 | 1.00 |
| Three | 0.89 | 0.11 | 1.00 |
| Two | 0.88 | 0.12 | 1.00 |
| All | 0.88 | 0.12 | 1.00 |
# Group-by
continuous_df.groupby(by = 'status').agg('mean')[['cnt_children', 'amt_income_total', 'cnt_fam_members', 'age', 'employed_years']]
| cnt_children | amt_income_total | cnt_fam_members | age | employed_years | |
|---|---|---|---|---|---|
| status | |||||
| 0 | 0.43 | 185785.99 | 2.20 | 43.35 | 5.60 |
| 1 | 0.45 | 193430.41 | 2.22 | 42.31 | 5.69 |
# Now filter rows by More than Seven
continuous_df[continuous_df.cnt_fam_members_bucket == 'More than Seven'][['cnt_children', 'amt_income_total', 'cnt_fam_members', 'age', 'employed_years']].agg('mean')
cnt_children 12.50 amt_income_total 183750.00 cnt_fam_members 13.83 age 41.33 employed_years 4.17 dtype: float64
continuous_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | ... | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | income_bucket | age | age_bucket | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | 19128 | ... | N | Security staff | 2.00 | 0 | None | Two | Very_low | 52 | Middle_Age | 3.00 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
5 rows × 23 columns
# Scatter plot
plt.subplots(figsize = (12, 8))
sns.scatterplot(continuous_df.cnt_children, continuous_df.amt_income_total)
#plt.ylim(0,25)
plt.title('Scatter Plot of Children Count with Total Income\n')
plt.xlabel('\nTotal Children Count')
plt.ylabel('Total Income\n')
plt.grid(False)
plt.show()
# Scatter plot
plt.subplots(figsize = (12, 8))
sns.scatterplot(continuous_df.cnt_fam_members, continuous_df.amt_income_total)
#plt.ylim(0,25)
plt.title('Scatter Plot of Family Members Count with Total Income\n')
plt.xlabel('\nTotal Family Members Count')
plt.ylabel('Total Income\n')
plt.grid(False)
plt.show()
# Scatter plot
plt.subplots(figsize = (12, 8))
sns.scatterplot(continuous_df.age, continuous_df.amt_income_total)
#plt.ylim(0,25)
plt.title('Scatter Plot of Age with Total Income\n')
plt.xlabel('\nAge')
plt.ylabel('Total Income\n')
plt.grid(False)
plt.show()
# Scatter plot
plt.subplots(figsize = (12, 8))
sns.scatterplot(continuous_df.employed_years, continuous_df.amt_income_total)
#plt.ylim(0,25)
plt.title('Scatter Plot of Years of Employment with Total Income\n')
plt.xlabel('\nYears of Employment')
plt.ylabel('Total Income\n')
plt.grid(False)
plt.show()
# PairGrid
g = sns.PairGrid(data = continuous_df[['cnt_children', 'amt_income_total', 'cnt_fam_members', 'age', 'employed_years', 'status']],
hue = 'status', size = 2.5, palette='RdBu_r')
g.map(plt.scatter, alpha=0.8)
g.add_legend()
<seaborn.axisgrid.PairGrid at 0x219ec1771c0>
boxplot_df = continuous_df.copy()
boxplot_df.head()
| code_gender | flag_own_car | flag_own_realty | cnt_children | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | days_birth | ... | flag_email | occupation_type | cnt_fam_members | status | children_cnt_bucket | cnt_fam_members_bucket | income_bucket | age | age_bucket | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 1 | F | N | N | 0 | 27000.00 | Working | Higher education | Civil marriage | House / apartment | 14869 | ... | N | Managers | 2.00 | 1 | None | Two | Very_low | 40 | Middle_Age | 13.00 |
| 2 | F | N | Y | 0 | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | 19128 | ... | N | Security staff | 2.00 | 0 | None | Two | Very_low | 52 | Middle_Age | 3.00 |
| 3 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
| 4 | F | Y | Y | 0 | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | 20086 | ... | N | NaN | 2.00 | 0 | None | Two | Very_low | 54 | Middle_Age | 0.00 |
5 rows × 23 columns
boxplot_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 36457 entries, 0 to 36456 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 code_gender 36457 non-null category 1 flag_own_car 36457 non-null category 2 flag_own_realty 36457 non-null category 3 cnt_children 36457 non-null int64 4 amt_income_total 36457 non-null float64 5 name_income_type 36457 non-null object 6 name_education_type 36457 non-null object 7 name_family_status 36457 non-null object 8 name_housing_type 36457 non-null object 9 days_birth 36457 non-null int64 10 days_employed 36457 non-null int64 11 flag_work_phone 36457 non-null category 12 flag_phone 36457 non-null category 13 flag_email 36457 non-null category 14 occupation_type 25134 non-null object 15 cnt_fam_members 36457 non-null float64 16 status 36457 non-null int64 17 children_cnt_bucket 36457 non-null object 18 cnt_fam_members_bucket 36457 non-null object 19 income_bucket 36457 non-null category 20 age 36457 non-null int32 21 age_bucket 36457 non-null category 22 employed_years 36457 non-null float64 dtypes: category(8), float64(3), int32(1), int64(4), object(7) memory usage: 4.3+ MB
numerical_col2 = boxplot_df[['cnt_children', 'amt_income_total', 'cnt_fam_members', 'age', 'employed_years']]
numerical_col2.head()
| cnt_children | amt_income_total | cnt_fam_members | age | employed_years | |
|---|---|---|---|---|---|
| 0 | 0 | 27000.00 | 2.00 | 40 | 13.00 |
| 1 | 0 | 27000.00 | 2.00 | 40 | 13.00 |
| 2 | 0 | 27000.00 | 2.00 | 52 | 3.00 |
| 3 | 0 | 29250.00 | 2.00 | 54 | 0.00 |
| 4 | 0 | 29250.00 | 2.00 | 54 | 0.00 |
fig , axes = plt.subplots(nrows = 3, ncols = 2, constrained_layout = True)
fig.subplots_adjust(left = 0, bottom = 0, right = 3, top = 5, wspace = 0.09, hspace = 0.3)
for ax, column in zip(axes.flatten(), numerical_col2):
sns.boxplot(numerical_col2[column], ax = ax)
plt.grid(False)
fig.delaxes(axes[2][1])
plt.show()
from wordcloud import WordCloud
nlp_df = cleaned_df.copy()
nlp_df.isna().sum()
code_gender 0 flag_own_car 0 flag_own_realty 0 amt_income_total 0 name_income_type 0 name_education_type 0 name_family_status 0 name_housing_type 0 flag_work_phone 0 flag_phone 0 flag_email 0 occupation_type 0 status 0 children_cnt_bucket 0 cnt_fam_members_bucket 0 age 0 employed_years 0 dtype: int64
# Create the an object fields and store the categorical columns in it
fields = ['name_income_type', 'name_education_type', 'name_family_status', 'name_housing_type', 'occupation_type']
# Read the dataset (use the cleaned EDA dataset)
text = pd.read_csv('D:\\D - Drive\\IPBA\\BYOP\\Capstone Project\\Final - Credit Card Approval Model\\cleaned_df_final.csv', usecols = fields)
inc_text = ' '.join(text['name_income_type'])
# Creating word_cloud with text as argument in .generate() method
word_cloud_inc = WordCloud(collocations = False, background_color = 'white').generate(inc_text)
# Generate plot
plt.figure(figsize = (15, 8))
plt.imshow(word_cloud_inc)
plt.title('name_income_type', fontsize = 30)
plt.axis("off")
plt.show()
edu_text = ' '.join(text['name_education_type'])
# Creating word_cloud with text as argument in .generate() method
word_cloud_edu = WordCloud(collocations = False, background_color = 'white').generate(edu_text)
# Generate plot
plt.figure(figsize = (15, 8))
plt.imshow(word_cloud_edu)
plt.title('name_education_type', fontsize = 30)
plt.axis("off")
plt.show()
fam_text = ' '.join(text['name_family_status'])
# Creating word_cloud with text as argument in .generate() method
word_cloud_fam = WordCloud(collocations = False, background_color = 'white').generate(fam_text)
# Generate plot
plt.figure(figsize = (15, 8))
plt.imshow(word_cloud_fam)
plt.title('name_family_status', fontsize = 30)
plt.axis("off")
plt.show()
house_text = ' '.join(text['name_housing_type'])
# Creating word_cloud with text as argument in .generate() method
word_cloud_house = WordCloud(collocations = False, background_color = 'white').generate(house_text)
# Generate plot
plt.figure(figsize = (15, 8))
plt.imshow(word_cloud_house)
plt.title('name_housing_type', fontsize = 30)
plt.axis("off")
plt.show()
occ_text = ' '.join(text['occupation_type'])
# Creating word_cloud with text as argument in .generate() method
word_cloud_occ = WordCloud(collocations = False, background_color = 'white').generate(occ_text)
# Generate plot
plt.figure(figsize = (15, 8))
plt.imshow(word_cloud_occ)
plt.title('occupation_type', fontsize = 30)
plt.axis("off")
plt.show()
# Correlation of cleaned dataset categorical_df after EDA
plt.figure(figsize = (8, 8), dpi = 80, facecolor = 'white', edgecolor = 'k')
sns.set(font_scale = 2)
hm_corr = sns.heatmap(cleaned_df.corr(), annot = True, vmin = -1, vmax = 1, cmap = 'coolwarm', fmt = '.2f',
cbar_kws = {"shrink": .82, 'label': 'Correlation %'},
annot_kws = {"size": 18}, linewidths = 0.1, linecolor = 'white', square = True)
plt.title('Correlation matrix of Cleaned Data (cleaned_df)\n')
hm_corr.set(xlabel = '\nApplicants Details', ylabel = 'Applicants Details\n')
hm_corr.set_xticklabels(hm_corr.get_xmajorticklabels(), fontsize = 12, rotation = 45)
hm_corr.set_yticklabels(hm_corr.get_ymajorticklabels(), fontsize = 12)
plt.savefig('D:\\D - Drive\\IPBA\\BYOP\\Capstone Project\\Final - Credit Card Approval Model\\corr_matrix_eda2.jpg')
plt.show()
encoding_df = cleaned_df.copy()
encoding_df.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | flag_email | occupation_type | status | children_cnt_bucket | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | F | N | N | 27000.00 | Working | Higher education | Civil marriage | House / apartment | Y | N | N | Managers | 1 | None | Two | 40 | 13.00 |
| 1 | F | N | N | 27000.00 | Working | Higher education | Civil marriage | House / apartment | Y | N | N | Managers | 1 | None | Two | 40 | 13.00 |
| 2 | F | N | Y | 27000.00 | Working | Secondary / secondary special | Married | House / apartment | Y | N | N | Security staff | 0 | None | Two | 52 | 3.00 |
| 3 | F | Y | Y | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | N | N | N | Retired | 0 | None | Two | 54 | 0.00 |
| 4 | F | Y | Y | 29250.00 | Pensioner | Secondary / secondary special | Married | House / apartment | N | N | N | Retired | 0 | None | Two | 54 | 0.00 |
from sklearn.preprocessing import LabelEncoder
# code_gender
encoding_df.code_gender.unique()
['F', 'M'] Categories (2, object): ['F', 'M']
encoding_df.code_gender.value_counts()
F 24430 M 12027 Name: code_gender, dtype: int64
le_code_gender = LabelEncoder()
encoding_df['code_gender'] = le_code_gender.fit_transform(encoding_df['code_gender'])
encoding_df['code_gender'].unique()
array([0, 1])
encoding_df.code_gender.value_counts()
0 24430 1 12027 Name: code_gender, dtype: int64
# flag_own_car
encoding_df.flag_own_car.unique()
['N', 'Y'] Categories (2, object): ['N', 'Y']
encoding_df.flag_own_car.value_counts()
N 22614 Y 13843 Name: flag_own_car, dtype: int64
le_flag_own_car = LabelEncoder()
encoding_df['flag_own_car'] = le_flag_own_car.fit_transform(encoding_df['flag_own_car'])
encoding_df['flag_own_car'].unique()
array([0, 1])
encoding_df.flag_own_car.value_counts()
0 22614 1 13843 Name: flag_own_car, dtype: int64
# flag_own_realty
encoding_df.flag_own_realty.unique()
['N', 'Y'] Categories (2, object): ['N', 'Y']
encoding_df.flag_own_realty.value_counts()
Y 24506 N 11951 Name: flag_own_realty, dtype: int64
le_flag_own_realty = LabelEncoder()
encoding_df['flag_own_realty'] = le_flag_own_realty.fit_transform(encoding_df['flag_own_realty'])
encoding_df['flag_own_realty'].unique()
array([0, 1])
encoding_df.flag_own_realty.value_counts()
1 24506 0 11951 Name: flag_own_realty, dtype: int64
# name_income_type
encoding_df.name_income_type.unique()
array(['Working', 'Pensioner', 'Commercial associate', 'State servant',
'Student'], dtype=object)
encoding_df.name_income_type.value_counts()
Working 18819 Commercial associate 8490 Pensioner 6152 State servant 2985 Student 11 Name: name_income_type, dtype: int64
le_name_income_type = LabelEncoder()
encoding_df['name_income_type'] = le_name_income_type.fit_transform(encoding_df['name_income_type'])
encoding_df['name_income_type'].unique()
array([4, 1, 0, 2, 3])
encoding_df.name_income_type.value_counts()
4 18819 0 8490 1 6152 2 2985 3 11 Name: name_income_type, dtype: int64
# name_education_type
encoding_df.name_education_type.unique()
array(['Higher education', 'Secondary / secondary special',
'Lower secondary', 'Incomplete higher', 'Academic degree'],
dtype=object)
encoding_df.name_education_type.value_counts()
Secondary / secondary special 24777 Higher education 9864 Incomplete higher 1410 Lower secondary 374 Academic degree 32 Name: name_education_type, dtype: int64
le_name_education_type = LabelEncoder()
encoding_df['name_education_type'] = le_name_education_type.fit_transform(encoding_df['name_education_type'])
encoding_df['name_education_type'].unique()
array([1, 4, 3, 2, 0])
encoding_df.name_education_type.value_counts()
4 24777 1 9864 2 1410 3 374 0 32 Name: name_education_type, dtype: int64
# name_family_status
encoding_df.name_family_status.unique()
array(['Civil marriage', 'Married', 'Separated', 'Single / not married',
'Widow'], dtype=object)
encoding_df.name_family_status.value_counts()
Married 25048 Single / not married 4829 Civil marriage 2945 Separated 2103 Widow 1532 Name: name_family_status, dtype: int64
le_name_family_status = LabelEncoder()
encoding_df['name_family_status'] = le_name_family_status.fit_transform(encoding_df['name_family_status'])
encoding_df['name_family_status'].unique()
array([0, 1, 2, 3, 4])
encoding_df.name_family_status.value_counts()
1 25048 3 4829 0 2945 2 2103 4 1532 Name: name_family_status, dtype: int64
# name_housing_type
encoding_df.name_housing_type.unique()
array(['House / apartment', 'Office apartment', 'Rented apartment',
'Municipal apartment', 'With parents', 'Co-op apartment'],
dtype=object)
encoding_df.name_housing_type.value_counts()
House / apartment 32548 With parents 1776 Municipal apartment 1128 Rented apartment 575 Office apartment 262 Co-op apartment 168 Name: name_housing_type, dtype: int64
le_name_housing_type = LabelEncoder()
encoding_df['name_housing_type'] = le_name_housing_type.fit_transform(encoding_df['name_housing_type'])
encoding_df['name_housing_type'].unique()
array([1, 3, 4, 2, 5, 0])
encoding_df.name_housing_type.value_counts()
1 32548 5 1776 2 1128 4 575 3 262 0 168 Name: name_housing_type, dtype: int64
# occupation_type
encoding_df.occupation_type.unique()
array(['Managers', 'Security staff', 'Retired', 'Cleaning staff',
'Core staff', 'Laborers', 'Others', 'Cooking staff',
'Medicine staff', 'Accountants', 'Sales staff',
'Low-skill Laborers', 'High skill tech staff', 'Drivers',
'Secretaries', 'Waiters/barmen staff', 'Private service staff',
'Realty agents', 'IT staff', 'HR staff'], dtype=object)
encoding_df.occupation_type.value_counts()
Laborers 6210 Retired 6136 Others 5188 Core staff 3591 Sales staff 3485 Managers 3012 Drivers 2138 High skill tech staff 1383 Accountants 1241 Medicine staff 1207 Cooking staff 655 Security staff 592 Cleaning staff 551 Private service staff 344 Low-skill Laborers 175 Waiters/barmen staff 174 Secretaries 151 HR staff 85 Realty agents 79 IT staff 60 Name: occupation_type, dtype: int64
le_occupation_type = LabelEncoder()
encoding_df['occupation_type'] = le_occupation_type.fit_transform(encoding_df['occupation_type'])
encoding_df['occupation_type'].unique()
array([10, 18, 15, 1, 3, 8, 12, 2, 11, 0, 16, 9, 6, 4, 17, 19, 13,
14, 7, 5])
encoding_df.occupation_type.value_counts()
8 6210 15 6136 12 5188 3 3591 16 3485 10 3012 4 2138 6 1383 0 1241 11 1207 2 655 18 592 1 551 13 344 9 175 19 174 17 151 5 85 14 79 7 60 Name: occupation_type, dtype: int64
# flag_work_phone
encoding_df.flag_work_phone.unique()
['Y', 'N'] Categories (2, object): ['N', 'Y']
encoding_df.flag_work_phone.value_counts()
N 28235 Y 8222 Name: flag_work_phone, dtype: int64
le_flag_work_phone = LabelEncoder()
encoding_df['flag_work_phone'] = le_flag_work_phone.fit_transform(encoding_df['flag_work_phone'])
encoding_df['flag_work_phone'].unique()
array([1, 0])
encoding_df.flag_work_phone.value_counts()
0 28235 1 8222 Name: flag_work_phone, dtype: int64
# flag_phone
encoding_df.flag_phone.unique()
['N', 'Y'] Categories (2, object): ['N', 'Y']
encoding_df.flag_phone.value_counts()
N 25709 Y 10748 Name: flag_phone, dtype: int64
le_flag_phone = LabelEncoder()
encoding_df['flag_phone'] = le_flag_phone.fit_transform(encoding_df['flag_phone'])
encoding_df['flag_phone'].unique()
array([0, 1])
encoding_df.flag_phone.value_counts()
0 25709 1 10748 Name: flag_phone, dtype: int64
# flag_email
encoding_df.flag_email.unique()
['N', 'Y'] Categories (2, object): ['N', 'Y']
encoding_df.flag_email.value_counts()
N 33186 Y 3271 Name: flag_email, dtype: int64
le_flag_email = LabelEncoder()
encoding_df['flag_email'] = le_flag_email.fit_transform(encoding_df['flag_email'])
encoding_df['flag_email'].unique()
array([0, 1])
encoding_df.flag_email.value_counts()
0 33186 1 3271 Name: flag_email, dtype: int64
# children_cnt_bucket
encoding_df.children_cnt_bucket.unique()
array(['None', 'One', 'Three', 'Two', 'Four', 'Five', 'More than Five'],
dtype=object)
encoding_df.children_cnt_bucket.value_counts()
None 25201 One 7492 Two 3256 Three 419 Four 63 Five 20 More than Five 6 Name: children_cnt_bucket, dtype: int64
le_children_cnt_bucket = LabelEncoder()
encoding_df['children_cnt_bucket'] = le_children_cnt_bucket.fit_transform(encoding_df['children_cnt_bucket'])
encoding_df['children_cnt_bucket'].unique()
array([3, 4, 5, 6, 1, 0, 2])
encoding_df.children_cnt_bucket.value_counts()
3 25201 4 7492 6 3256 5 419 1 63 0 20 2 6 Name: children_cnt_bucket, dtype: int64
# cnt_fam_members_bucket
encoding_df.cnt_fam_members_bucket.unique()
array(['Two', 'Three', 'One', 'Five', 'Four', 'Six', 'Seven',
'More than Seven'], dtype=object)
encoding_df.cnt_fam_members_bucket.value_counts()
Two 19463 One 6987 Three 6421 Four 3106 Five 397 Six 58 Seven 19 More than Seven 6 Name: cnt_fam_members_bucket, dtype: int64
le_cnt_fam_members_bucket = LabelEncoder()
encoding_df['cnt_fam_members_bucket'] = le_cnt_fam_members_bucket.fit_transform(encoding_df['cnt_fam_members_bucket'])
encoding_df['cnt_fam_members_bucket'].unique()
array([7, 6, 3, 0, 1, 5, 4, 2])
encoding_df.cnt_fam_members_bucket.value_counts()
7 19463 3 6987 6 6421 1 3106 0 397 5 58 4 19 2 6 Name: cnt_fam_members_bucket, dtype: int64
encoding_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 36457 entries, 0 to 36456 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 code_gender 36457 non-null int32 1 flag_own_car 36457 non-null int32 2 flag_own_realty 36457 non-null int32 3 amt_income_total 36457 non-null float64 4 name_income_type 36457 non-null int32 5 name_education_type 36457 non-null int32 6 name_family_status 36457 non-null int32 7 name_housing_type 36457 non-null int32 8 flag_work_phone 36457 non-null int32 9 flag_phone 36457 non-null int32 10 flag_email 36457 non-null int32 11 occupation_type 36457 non-null int32 12 status 36457 non-null int64 13 children_cnt_bucket 36457 non-null int32 14 cnt_fam_members_bucket 36457 non-null int32 15 age 36457 non-null int32 16 employed_years 36457 non-null float64 dtypes: float64(2), int32(14), int64(1) memory usage: 2.8 MB
# Correlation of cleaned dataset encoding_df after Label Encoder
plt.figure(figsize = (20, 20), dpi = 80, facecolor = 'white', edgecolor = 'k')
sns.set(font_scale = 2)
hm_corr2 = sns.heatmap(encoding_df.corr(), annot = True, vmin = -1, vmax = 1, cmap = 'coolwarm', fmt = '.2f',
cbar_kws = {"shrink": .82, 'label': 'Correlation %'},
annot_kws = {"size": 18}, linewidths = 0.1, linecolor = 'white', square = True)
plt.title('Correlation matrix of Encoded Data (encoding_df)\n')
hm_corr2.set(xlabel = '\nApplicants Details', ylabel = 'Applicants Details\n')
hm_corr2.set_xticklabels(hm_corr2.get_xmajorticklabels(), fontsize = 12, rotation = 45)
hm_corr2.set_yticklabels(hm_corr2.get_ymajorticklabels(), fontsize = 12)
plt.savefig('D:\\D - Drive\\IPBA\\BYOP\\Capstone Project\\Final - Credit Card Approval Model\\corr_matrix_eda3.jpg')
plt.show()
from statsmodels.stats.outliers_influence import variance_inflation_factor
encoding_df.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | flag_email | occupation_type | status | children_cnt_bucket | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 27000.00 | 4 | 1 | 0 | 1 | 1 | 0 | 0 | 10 | 1 | 3 | 7 | 40 | 13.00 |
| 1 | 0 | 0 | 0 | 27000.00 | 4 | 1 | 0 | 1 | 1 | 0 | 0 | 10 | 1 | 3 | 7 | 40 | 13.00 |
| 2 | 0 | 0 | 1 | 27000.00 | 4 | 4 | 1 | 1 | 1 | 0 | 0 | 18 | 0 | 3 | 7 | 52 | 3.00 |
| 3 | 0 | 1 | 1 | 29250.00 | 1 | 4 | 1 | 1 | 0 | 0 | 0 | 15 | 0 | 3 | 7 | 54 | 0.00 |
| 4 | 0 | 1 | 1 | 29250.00 | 1 | 4 | 1 | 1 | 0 | 0 | 0 | 15 | 0 | 3 | 7 | 54 | 0.00 |
vif_data = pd.DataFrame()
vif_data["Columns"] = encoding_df.columns
vif_data["VIF"] = [variance_inflation_factor(encoding_df.values, i)
for i in range(len(encoding_df.columns))]
vif_data.sort_values('VIF', ascending = False)
| Columns | VIF | |
|---|---|---|
| 15 | age | 17.48 |
| 13 | children_cnt_bucket | 11.47 |
| 14 | cnt_fam_members_bucket | 9.03 |
| 5 | name_education_type | 6.86 |
| 11 | occupation_type | 5.83 |
| 3 | amt_income_total | 4.93 |
| 6 | name_family_status | 3.63 |
| 2 | flag_own_realty | 3.29 |
| 4 | name_income_type | 3.26 |
| 7 | name_housing_type | 2.91 |
| 16 | employed_years | 1.97 |
| 1 | flag_own_car | 1.95 |
| 0 | code_gender | 1.84 |
| 9 | flag_phone | 1.59 |
| 8 | flag_work_phone | 1.57 |
| 12 | status | 1.14 |
| 10 | flag_email | 1.13 |
encoding_df = encoding_df.drop(['children_cnt_bucket'], axis = 1)
encoding_df.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | flag_email | occupation_type | status | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 27000.00 | 4 | 1 | 0 | 1 | 1 | 0 | 0 | 10 | 1 | 7 | 40 | 13.00 |
| 1 | 0 | 0 | 0 | 27000.00 | 4 | 1 | 0 | 1 | 1 | 0 | 0 | 10 | 1 | 7 | 40 | 13.00 |
| 2 | 0 | 0 | 1 | 27000.00 | 4 | 4 | 1 | 1 | 1 | 0 | 0 | 18 | 0 | 7 | 52 | 3.00 |
| 3 | 0 | 1 | 1 | 29250.00 | 1 | 4 | 1 | 1 | 0 | 0 | 0 | 15 | 0 | 7 | 54 | 0.00 |
| 4 | 0 | 1 | 1 | 29250.00 | 1 | 4 | 1 | 1 | 0 | 0 | 0 | 15 | 0 | 7 | 54 | 0.00 |
encoding_df.shape
(36457, 16)
encoding_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 36457 entries, 0 to 36456 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 code_gender 36457 non-null int32 1 flag_own_car 36457 non-null int32 2 flag_own_realty 36457 non-null int32 3 amt_income_total 36457 non-null float64 4 name_income_type 36457 non-null int32 5 name_education_type 36457 non-null int32 6 name_family_status 36457 non-null int32 7 name_housing_type 36457 non-null int32 8 flag_work_phone 36457 non-null int32 9 flag_phone 36457 non-null int32 10 flag_email 36457 non-null int32 11 occupation_type 36457 non-null int32 12 status 36457 non-null int64 13 cnt_fam_members_bucket 36457 non-null int32 14 age 36457 non-null int32 15 employed_years 36457 non-null float64 dtypes: float64(2), int32(13), int64(1) memory usage: 2.6 MB
# Recheck the VIF of the dataset
vif_data2 = pd.DataFrame()
vif_data2["Columns"] = encoding_df.columns
vif_data2["VIF"] = [variance_inflation_factor(encoding_df.values, i)
for i in range(len(encoding_df.columns))]
vif_data2.sort_values('VIF', ascending = False)
| Columns | VIF | |
|---|---|---|
| 14 | age | 16.23 |
| 13 | cnt_fam_members_bucket | 8.46 |
| 5 | name_education_type | 6.51 |
| 11 | occupation_type | 5.63 |
| 3 | amt_income_total | 4.58 |
| 6 | name_family_status | 3.60 |
| 2 | flag_own_realty | 3.17 |
| 4 | name_income_type | 3.03 |
| 7 | name_housing_type | 2.70 |
| 15 | employed_years | 1.95 |
| 1 | flag_own_car | 1.91 |
| 0 | code_gender | 1.83 |
| 9 | flag_phone | 1.59 |
| 8 | flag_work_phone | 1.54 |
| 12 | status | 1.13 |
| 10 | flag_email | 1.12 |
encoding_df.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | flag_email | occupation_type | status | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 27000.00 | 4 | 1 | 0 | 1 | 1 | 0 | 0 | 10 | 1 | 7 | 40 | 13.00 |
| 1 | 0 | 0 | 0 | 27000.00 | 4 | 1 | 0 | 1 | 1 | 0 | 0 | 10 | 1 | 7 | 40 | 13.00 |
| 2 | 0 | 0 | 1 | 27000.00 | 4 | 4 | 1 | 1 | 1 | 0 | 0 | 18 | 0 | 7 | 52 | 3.00 |
| 3 | 0 | 1 | 1 | 29250.00 | 1 | 4 | 1 | 1 | 0 | 0 | 0 | 15 | 0 | 7 | 54 | 0.00 |
| 4 | 0 | 1 | 1 | 29250.00 | 1 | 4 | 1 | 1 | 0 | 0 | 0 | 15 | 0 | 7 | 54 | 0.00 |
# Correlation of cleaned dataset encoding_df after Label Encoder
plt.figure(figsize = (20, 20), dpi = 80, facecolor = 'white', edgecolor = 'k')
sns.set(font_scale = 2)
hm_corr3 = sns.heatmap(encoding_df.corr(), annot = True, vmin = -1, vmax = 1, cmap = 'coolwarm', fmt = '.2f',
cbar_kws = {"shrink": .82, 'label': 'Correlation %'},
annot_kws = {"size": 18}, linewidths = 0.1, linecolor = 'white', square = True)
plt.title('Correlation matrix of Encoded Data (encoding_df)\n')
hm_corr3.set(xlabel = '\nApplicants Details', ylabel = 'Applicants Details\n')
hm_corr3.set_xticklabels(hm_corr3.get_xmajorticklabels(), fontsize = 12, rotation = 45)
hm_corr3.set_yticklabels(hm_corr3.get_ymajorticklabels(), fontsize = 12)
plt.savefig('D:\\D - Drive\\IPBA\\BYOP\\Capstone Project\\Final - Credit Card Approval Model\\corr_matrix_vif4.jpg')
plt.show()
import statsmodels.api as sm
p_value_df = encoding_df.copy()
p_value_df.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | flag_email | occupation_type | status | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 27000.00 | 4 | 1 | 0 | 1 | 1 | 0 | 0 | 10 | 1 | 7 | 40 | 13.00 |
| 1 | 0 | 0 | 0 | 27000.00 | 4 | 1 | 0 | 1 | 1 | 0 | 0 | 10 | 1 | 7 | 40 | 13.00 |
| 2 | 0 | 0 | 1 | 27000.00 | 4 | 4 | 1 | 1 | 1 | 0 | 0 | 18 | 0 | 7 | 52 | 3.00 |
| 3 | 0 | 1 | 1 | 29250.00 | 1 | 4 | 1 | 1 | 0 | 0 | 0 | 15 | 0 | 7 | 54 | 0.00 |
| 4 | 0 | 1 | 1 | 29250.00 | 1 | 4 | 1 | 1 | 0 | 0 | 0 | 15 | 0 | 7 | 54 | 0.00 |
X_p_value_df = p_value_df.drop(['status'], axis = 1)
y_p_value_df = p_value_df['status']
X_p_value_df = sm.add_constant(X_p_value_df)
model_demo = sm.OLS(y_p_value_df, X_p_value_df)
# model_demo = sm.Logit(y_demo, X_demo)
results = model_demo.fit()
print(results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: status R-squared: 0.004
Model: OLS Adj. R-squared: 0.003
Method: Least Squares F-statistic: 8.735
Date: Sat, 24 Sep 2022 Prob (F-statistic): 1.47e-20
Time: 18:28:17 Log-Likelihood: -10380.
No. Observations: 36457 AIC: 2.079e+04
Df Residuals: 36441 BIC: 2.093e+04
Df Model: 15
Covariance Type: nonrobust
==========================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------
const 0.1642 0.012 13.385 0.000 0.140 0.188
code_gender 0.0142 0.004 3.565 0.000 0.006 0.022
flag_own_car -0.0181 0.004 -4.741 0.000 -0.026 -0.011
flag_own_realty -0.0190 0.004 -5.074 0.000 -0.026 -0.012
amt_income_total 7.209e-08 1.78e-08 4.048 0.000 3.72e-08 1.07e-07
name_income_type -0.0027 0.001 -2.642 0.008 -0.005 -0.001
name_education_type 0.0017 0.001 1.270 0.204 -0.001 0.004
name_family_status -0.0009 0.002 -0.441 0.659 -0.005 0.003
name_housing_type 0.0008 0.002 0.419 0.675 -0.003 0.004
flag_work_phone -0.0057 0.004 -1.266 0.205 -0.014 0.003
flag_phone -0.0038 0.004 -0.977 0.328 -0.012 0.004
flag_email 0.0191 0.006 3.195 0.001 0.007 0.031
occupation_type -0.0008 0.000 -2.178 0.029 -0.002 -8.16e-05
cnt_fam_members_bucket -0.0017 0.001 -1.843 0.065 -0.003 0.000
age -0.0006 0.000 -3.622 0.000 -0.001 -0.000
employed_years 0.0002 0.000 0.694 0.487 -0.000 0.001
==============================================================================
Omnibus: 15938.859 Durbin-Watson: 1.733
Prob(Omnibus): 0.000 Jarque-Bera (JB): 53603.753
Skew: 2.360 Prob(JB): 0.00
Kurtosis: 6.606 Cond. No. 1.57e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.57e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
model_df = encoding_df.copy()
model_df.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | flag_email | occupation_type | status | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 27000.00 | 4 | 1 | 0 | 1 | 1 | 0 | 0 | 10 | 1 | 7 | 40 | 13.00 |
| 1 | 0 | 0 | 0 | 27000.00 | 4 | 1 | 0 | 1 | 1 | 0 | 0 | 10 | 1 | 7 | 40 | 13.00 |
| 2 | 0 | 0 | 1 | 27000.00 | 4 | 4 | 1 | 1 | 1 | 0 | 0 | 18 | 0 | 7 | 52 | 3.00 |
| 3 | 0 | 1 | 1 | 29250.00 | 1 | 4 | 1 | 1 | 0 | 0 | 0 | 15 | 0 | 7 | 54 | 0.00 |
| 4 | 0 | 1 | 1 | 29250.00 | 1 | 4 | 1 | 1 | 0 | 0 | 0 | 15 | 0 | 7 | 54 | 0.00 |
model_df.shape
(36457, 16)
# Save the Dataset for model building
model_df.to_csv('D:\\D - Drive\\IPBA\\BYOP\\Capstone Project\\Final - Credit Card Approval Model\\model_dataset.csv', index = False)
# X value contains all the variables except status (target variable)
X = model_df.drop(['status'], axis = 1)
X.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 27000.00 | 4 | 1 | 0 | 1 | 1 | 0 | 0 | 10 | 7 | 40 | 13.00 |
| 1 | 0 | 0 | 0 | 27000.00 | 4 | 1 | 0 | 1 | 1 | 0 | 0 | 10 | 7 | 40 | 13.00 |
| 2 | 0 | 0 | 1 | 27000.00 | 4 | 4 | 1 | 1 | 1 | 0 | 0 | 18 | 7 | 52 | 3.00 |
| 3 | 0 | 1 | 1 | 29250.00 | 1 | 4 | 1 | 1 | 0 | 0 | 0 | 15 | 7 | 54 | 0.00 |
| 4 | 0 | 1 | 1 | 29250.00 | 1 | 4 | 1 | 1 | 0 | 0 | 0 | 15 | 7 | 54 | 0.00 |
X.shape
(36457, 15)
# y contains only status (target variable)
y = model_df['status']
y.head()
0 1 1 1 2 0 3 0 4 0 Name: status, dtype: int64
y.shape
(36457,)
# We create the test train split first
from sklearn.model_selection import train_test_split
X_balanced, X_test_balanced, y_balanced, y_test_balanced = train_test_split(X , y, test_size = 0.3, random_state = 42, stratify = y)
encoding_df.status.value_counts() / encoding_df.shape[0]
0 0.88 1 0.12 Name: status, dtype: float64
y_balanced.value_counts() / len(y_balanced)
0 0.88 1 0.12 Name: status, dtype: float64
y_test_balanced.value_counts() / len(y_test_balanced)
0 0.88 1 0.12 Name: status, dtype: float64
X_balanced.shape
(25519, 15)
y_balanced.shape
(25519,)
X_test_balanced.shape
(10938, 15)
y_test_balanced.shape
(10938,)
X_balanced.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11957 | 1 | 1 | 1 | 135000.00 | 4 | 4 | 1 | 1 | 0 | 1 | 0 | 8 | 7 | 39 | 9.00 |
| 11052 | 1 | 0 | 1 | 135000.00 | 4 | 4 | 0 | 1 | 0 | 0 | 0 | 12 | 6 | 36 | 7.00 |
| 13960 | 1 | 0 | 0 | 135000.00 | 4 | 4 | 1 | 1 | 1 | 0 | 0 | 4 | 7 | 40 | 9.00 |
| 5372 | 0 | 0 | 1 | 103500.00 | 0 | 4 | 1 | 1 | 1 | 1 | 0 | 16 | 7 | 46 | 4.00 |
| 30992 | 1 | 1 | 0 | 270000.00 | 0 | 4 | 1 | 1 | 1 | 1 | 0 | 10 | 7 | 56 | 7.00 |
y_balanced.head()
11957 0 11052 0 13960 0 5372 0 30992 1 Name: status, dtype: int64
y_balanced.value_counts()
0 22515 1 3004 Name: status, dtype: int64
X_test_balanced.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 36245 | 0 | 0 | 1 | 675000.00 | 4 | 4 | 1 | 1 | 0 | 0 | 1 | 12 | 7 | 49 | 18.00 |
| 30510 | 0 | 0 | 1 | 270000.00 | 4 | 4 | 1 | 1 | 0 | 0 | 0 | 6 | 7 | 49 | 27.00 |
| 7249 | 0 | 0 | 1 | 112500.00 | 4 | 4 | 1 | 1 | 0 | 0 | 1 | 16 | 7 | 42 | 6.00 |
| 22707 | 1 | 0 | 0 | 189000.00 | 2 | 4 | 1 | 1 | 0 | 0 | 0 | 3 | 7 | 43 | 7.00 |
| 18346 | 0 | 0 | 0 | 157500.00 | 4 | 1 | 2 | 1 | 0 | 0 | 0 | 11 | 3 | 47 | 8.00 |
y_test_balanced.head()
36245 0 30510 0 7249 0 22707 0 18346 0 Name: status, dtype: int64
y_test_balanced.value_counts()
0 9651 1 1287 Name: status, dtype: int64
Insights:-
# Import the model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix
classifiers = {
"LogisticRegression" : LogisticRegression(),
"KNeighbors" : KNeighborsClassifier(),
"DecisionTree" : DecisionTreeClassifier(),
"RandomForest" : RandomForestClassifier(),
"XGBoost" : XGBClassifier(),
"GradientBoostingClassifier" : GradientBoostingClassifier()
}
scores_dict = {}
for key, classifier in classifiers.items():
classifier.fit(X_balanced, y_balanced)
train_score = classifier.score(X_balanced, y_balanced)
test_score = classifier.score(X_test_balanced, y_test_balanced)
scores_dict[key] = {"Train Score" : train_score, "Test Score" : test_score}
for key, value in scores_dict.items():
print("\n{} :".format(key))
for key1, value1 in value.items():
print("\t{}\t : {}".format(key1, value1))
LogisticRegression : Train Score : 0.882283788549708 Test Score : 0.8823368074602304 KNeighbors : Train Score : 0.9003487597476391 Test Score : 0.8722801243371732 DecisionTree : Train Score : 0.9336572749715898 Test Score : 0.8762113731943683 RandomForest : Train Score : 0.9336572749715898 Test Score : 0.8819711098921192 XGBoost : Train Score : 0.8987029272306909 Test Score : 0.8869994514536478 GradientBoostingClassifier : Train Score : 0.8826364669461969 Test Score : 0.882519656244286
plt.rcParams.update({'figure.figsize': (12.0, 8.0)})
#xgb = XGBClassifier()
xgb = XGBClassifier()
#model = xgb.fit(X_balanced, y_balanced)
xgb.fit(X_balanced, y_balanced)
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
importance_type=None, interaction_constraints='',
learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()', n_estimators=100,
n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
reg_alpha=0, reg_lambda=1, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
importance_type=None, interaction_constraints='',
learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()', n_estimators=100,
n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
reg_alpha=0, reg_lambda=1, ...)# Check the probability of the Eligible applicants
xgb.predict_proba(X_test_balanced)
array([[0.7641203 , 0.23587973],
[0.94811034, 0.05188968],
[0.93724144, 0.06275856],
...,
[0.87145746, 0.12854256],
[0.94180006, 0.05819992],
[0.6944636 , 0.30553636]], dtype=float32)
# Predict the eligibility of the applicants
xgb_pred = xgb.predict(X_test_balanced)
#print(prediction)
xgb_pred
array([0, 0, 0, ..., 0, 0, 0])
xgb_pred.shape
(10938,)
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score
print(classification_report(y_test_balanced, xgb_pred))
precision recall f1-score support
0 0.89 0.99 0.94 9651
1 0.61 0.11 0.19 1287
accuracy 0.89 10938
macro avg 0.75 0.55 0.56 10938
weighted avg 0.86 0.89 0.85 10938
Accuracy = metrics.accuracy_score(y_test_balanced, xgb_pred)
Precision = metrics.precision_score(y_test_balanced, xgb_pred)
Sensitivity_recall = metrics.recall_score(y_test_balanced, xgb_pred)
Specificity = metrics.recall_score(y_test_balanced, xgb_pred, pos_label = 0)
F1_score = metrics.f1_score(y_test_balanced, xgb_pred)
print({"Accuracy": Accuracy, "Precision": Precision, "Sensitivity_recall": Sensitivity_recall,
"Specificity": Specificity, "F1_score": F1_score})
{'Accuracy': 0.8869994514536478, 'Precision': 0.6075949367088608, 'Sensitivity_recall': 0.11188811188811189, 'Specificity': 0.9903636928815667, 'F1_score': 0.1889763779527559}
# RMSE Computation
rmse = np.sqrt(MSE(y_test_balanced, xgb_pred))
print("RMSE : % f" %(rmse))
RMSE : 0.336156
# Accuracy Score
print('Accuracy Score is {:.5}'.format(accuracy_score(y_test_balanced, xgb_pred)))
Accuracy Score is 0.887
# Confusion Matrix Chart
confusion_matrix = metrics.confusion_matrix(y_test_balanced, xgb_pred)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ['Eligible: 0', 'Not Eligible: 1'])
cm_display.plot(cmap = 'viridis', colorbar = False, xticks_rotation='horizontal')
cm_display.ax_.set_title("CONFUSION MATRIX\n" + "(XGBoostClassifier)\n")
plt.yticks(rotation = 90)
plt.grid(False)
plt.show()
# AUC - ROC
fpr, tpr, _ = metrics.roc_curve(y_test_balanced, xgb_pred)
auc = metrics.roc_auc_score(y_test_balanced, xgb_pred)
# ax = plt.axes()
plt.plot(fpr, tpr, label = 'AUC Score = %.4f'%auc)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC CURVE' + "\n(XGBoostClassifier)\n")
plt.legend(loc = 4)
# ax.set_facecolor("grey")
plt.grid(False)
plt.show()
# Find the feature importance
importances = pd.DataFrame(data = {
'Attribute': X_test_balanced.columns,
'Importance': xgb.feature_importances_
})
importances = importances.sort_values(by = 'Importance', ascending = False)
importances
| Attribute | Importance | |
|---|---|---|
| 10 | flag_email | 0.08 |
| 6 | name_family_status | 0.07 |
| 11 | occupation_type | 0.07 |
| 4 | name_income_type | 0.07 |
| 5 | name_education_type | 0.07 |
| 14 | employed_years | 0.07 |
| 0 | code_gender | 0.07 |
| 12 | cnt_fam_members_bucket | 0.07 |
| 13 | age | 0.07 |
| 3 | amt_income_total | 0.07 |
| 1 | flag_own_car | 0.07 |
| 8 | flag_work_phone | 0.06 |
| 9 | flag_phone | 0.06 |
| 2 | flag_own_realty | 0.06 |
| 7 | name_housing_type | 0.06 |
# Visually plot the feature importances
plt.bar(x = importances['Attribute'], height = importances['Importance'], color = '#087E8B')
plt.title('Feature Importances obtained from coefficients - XGBoostClassifier', size = 20)
plt.xticks(rotation = 'vertical')
plt.grid(False)
plt.show()
classifierLR = LogisticRegression(random_state = 42)
classifierLR.fit(X_balanced, y_balanced)
LogisticRegression(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(random_state=42)
classifierLR.classes_
array([0, 1], dtype=int64)
classifierLR.intercept_
array([-0.00079533])
classifierLR.coef_
array([[-4.49477424e-05, -1.46305928e-04, -5.86505862e-04,
-1.10170730e-06, -2.06135467e-03, -2.94275601e-03,
-1.04725840e-03, -9.40760603e-04, -2.14072027e-04,
-2.43595801e-04, 5.82821780e-06, -9.34892576e-03,
-4.56880447e-03, -3.80768034e-02, -3.30012017e-03]])
pred_prob = classifierLR.predict_proba(X_test_balanced)
pred_prob
array([[0.94460969, 0.05539031],
[0.91402403, 0.08597597],
[0.87521895, 0.12478105],
...,
[0.90400326, 0.09599674],
[0.94300127, 0.05699873],
[0.79473563, 0.20526437]])
In the matrix above, each row corresponds to a single observation. The first column is the probability of the predicted output being zero, that is 1 - 𝑝(𝑥). The second column is the probability that the output is one, or 𝑝(𝑥).
y_predLR = classifierLR.predict(X_test_balanced)
y_predLR
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
y_predLR.shape
(10938,)
print('Accuracy Score is {:.5}'.format(accuracy_score(y_test_balanced, y_predLR)))
Accuracy Score is 0.88234
# Confusion Matrix Chart
confusion_matrix = metrics.confusion_matrix(y_test_balanced, y_predLR)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ['Eligible: 0', 'Not Eligible: 1'])
cm_display.plot(cmap = 'viridis', colorbar = False, xticks_rotation='horizontal')
cm_display.ax_.set_title("CONFUSION MATRIX\n" + "(LogisticRegression)\n")
plt.yticks(rotation = 90)
plt.grid(False)
plt.show()
The Confusion Matrix created has four different quadrants:
True means that the values were accurately predicted, False means that there was an error or wrong prediction.
print(classification_report(y_test_balanced, y_predLR))
precision recall f1-score support
0 0.88 1.00 0.94 9651
1 0.00 0.00 0.00 1287
accuracy 0.88 10938
macro avg 0.44 0.50 0.47 10938
weighted avg 0.78 0.88 0.83 10938
Accuracy = metrics.accuracy_score(y_test_balanced, y_predLR)
Precision = metrics.precision_score(y_test_balanced, y_predLR)
Sensitivity_recall = metrics.recall_score(y_test_balanced, y_predLR)
Specificity = metrics.recall_score(y_test_balanced, y_predLR, pos_label = 0)
F1_score = metrics.f1_score(y_test_balanced, y_predLR)
print({"Accuracy": Accuracy, "Precision": Precision, "Sensitivity_recall": Sensitivity_recall,
"Specificity": Specificity, "F1_score": F1_score})
{'Accuracy': 0.8823368074602304, 'Precision': 0.0, 'Sensitivity_recall': 0.0, 'Specificity': 1.0, 'F1_score': 0.0}
# AUC - ROC
fpr, tpr, _ = metrics.roc_curve(y_test_balanced, y_predLR)
auc = metrics.roc_auc_score(y_test_balanced, y_predLR)
# ax = plt.axes()
plt.plot(fpr, tpr, label = 'AUC Score = %.4f'%auc)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC CURVE' + "\n(LogisticRegression)\n")
plt.legend(loc = 4)
# ax.set_facecolor("grey")
plt.grid(False)
plt.show()
# RMSE Computation
rmse = np.sqrt(MSE(y_test_balanced, y_predLR))
print("RMSE : % f" %(rmse))
RMSE : 0.343021
# Find the value of k
import matplotlib.pyplot as plt
from matplotlib import style
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
cost =[]
for i in range(1, 11):
KM = KMeans(n_clusters = i, max_iter = 500)
KM.fit(X_test_balanced)
# calculates squared error
# for the clustered points
cost.append(KM.inertia_)
# plot the cost against K values
plt.plot(range(1, 11), cost, color ='g', linewidth ='3')
plt.xlabel("Value of K")
plt.ylabel("Squared Error (Cost)")
plt.show()
In this case the optimal value for k would be 4. (the last elbow bend)
# Apply the k value in the model
knn = KNeighborsClassifier(n_neighbors = 4)
knn.fit(X_balanced, y_balanced)
KNeighborsClassifier(n_neighbors=4)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KNeighborsClassifier(n_neighbors=4)
knn_pred_prob = knn.predict_proba(X_test_balanced)
knn_pred_prob
array([[0.75, 0.25],
[1. , 0. ],
[1. , 0. ],
...,
[0.75, 0.25],
[1. , 0. ],
[0.5 , 0.5 ]])
knn_pred = knn.predict(X_test_balanced)
knn_pred
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
knn_pred.shape
(10938,)
print('Accuracy Score is {:.5}'.format(accuracy_score(y_test_balanced, knn_pred)))
Accuracy Score is 0.8806
# Confusion Matrix Chart
confusion_matrix = metrics.confusion_matrix(y_test_balanced, knn_pred)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ['Eligible: 0', 'Not Eligible: 1'])
cm_display.plot(cmap = 'viridis', colorbar = False, xticks_rotation='horizontal')
cm_display.ax_.set_title("CONFUSION MATRIX\n" + "(KNeighborsClassifier)\n")
plt.yticks(rotation = 90)
plt.grid(False)
plt.show()
print(classification_report(y_test_balanced, knn_pred))
precision recall f1-score support
0 0.90 0.98 0.94 9651
1 0.48 0.15 0.23 1287
accuracy 0.88 10938
macro avg 0.69 0.57 0.58 10938
weighted avg 0.85 0.88 0.85 10938
Accuracy = metrics.accuracy_score(y_test_balanced, knn_pred)
Precision = metrics.precision_score(y_test_balanced, knn_pred)
Sensitivity_recall = metrics.recall_score(y_test_balanced, knn_pred)
Specificity = metrics.recall_score(y_test_balanced, knn_pred, pos_label = 0)
F1_score = metrics.f1_score(y_test_balanced, knn_pred)
print({"Accuracy": Accuracy, "Precision": Precision, "Sensitivity_recall": Sensitivity_recall,
"Specificity": Specificity, "F1_score": F1_score})
{'Accuracy': 0.8805997440117023, 'Precision': 0.4771084337349398, 'Sensitivity_recall': 0.15384615384615385, 'Specificity': 0.9775152833903222, 'F1_score': 0.23266745005875444}
# AUC - ROC
fpr, tpr, _ = metrics.roc_curve(y_test_balanced, knn_pred)
auc = metrics.roc_auc_score(y_test_balanced, knn_pred)
# ax = plt.axes()
plt.plot(fpr, tpr, label = 'AUC Score = %.4f'%auc)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC CURVE' + "\n(KNeighborsClassifier)\n")
plt.legend(loc = 4)
# ax.set_facecolor("grey")
plt.grid(False)
plt.show()
# RMSE Computation
rmse = np.sqrt(MSE(y_test_balanced, knn_pred))
print("RMSE : % f" %(rmse))
RMSE : 0.345543
# Create Decision Tree classifer object
clf_dt = DecisionTreeClassifier()
# Train Decision Tree Classifer
clf_dt = clf_dt.fit(X_balanced, y_balanced)
#Predict probabilities for test dataset
clf_dt.predict_proba(X_test_balanced)
array([[0.75, 0.25],
[1. , 0. ],
[1. , 0. ],
...,
[0.8 , 0.2 ],
[1. , 0. ],
[0. , 1. ]])
#Predict the response for test dataset
y_pred_dt = clf_dt.predict(X_test_balanced)
y_pred_dt
array([0, 0, 0, ..., 0, 0, 1], dtype=int64)
# Model Accuracy, how often is the classifier correct?
print('Accuracy Score is {:.5}'.format(accuracy_score(y_test_balanced, y_pred_dt)))
Accuracy Score is 0.8763
# Confusion Matrix Chart
confusion_matrix = metrics.confusion_matrix(y_test_balanced, y_pred_dt)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ['Eligible: 0', 'Not Eligible: 1'])
cm_display.plot(cmap = 'viridis', colorbar = False, xticks_rotation='horizontal')
cm_display.ax_.set_title("CONFUSION MATRIX\n" + "(DecisionTreeClassifier)\n")
plt.yticks(rotation = 90)
plt.grid(False)
plt.show()
print(classification_report(y_test_balanced, y_pred_dt))
precision recall f1-score support
0 0.91 0.95 0.93 9651
1 0.46 0.30 0.36 1287
accuracy 0.88 10938
macro avg 0.69 0.63 0.65 10938
weighted avg 0.86 0.88 0.86 10938
Accuracy = metrics.accuracy_score(y_test_balanced, y_pred_dt)
Precision = metrics.precision_score(y_test_balanced, y_pred_dt)
Sensitivity_recall = metrics.recall_score(y_test_balanced, y_pred_dt)
Specificity = metrics.recall_score(y_test_balanced, y_pred_dt, pos_label = 0)
F1_score = metrics.f1_score(y_test_balanced, y_pred_dt)
print({"Accuracy": Accuracy, "Precision": Precision, "Sensitivity_recall": Sensitivity_recall,
"Specificity": Specificity, "F1_score": F1_score})
{'Accuracy': 0.8763027975863961, 'Precision': 0.4607142857142857, 'Sensitivity_recall': 0.3006993006993007, 'Specificity': 0.953061858874728, 'F1_score': 0.36389280677009866}
# AUC - ROC
fpr, tpr, _ = metrics.roc_curve(y_test_balanced, y_pred_dt)
auc = metrics.roc_auc_score(y_test_balanced, y_pred_dt)
# ax = plt.axes()
plt.plot(fpr, tpr, label = 'AUC Score = %.4f'%auc)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC CURVE' + "\n(DecisionTreeClassifier)\n")
plt.legend(loc = 4)
# ax.set_facecolor("grey")
plt.grid(False)
plt.show()
# RMSE Computation
rmse = np.sqrt(MSE(y_test_balanced, y_pred_dt))
print("RMSE : % f" %(rmse))
RMSE : 0.351706
# Decision Tree - Gini
clf_dt_gini = DecisionTreeClassifier(criterion = "gini", max_depth = 3, min_samples_leaf = 5, random_state = 42)
clf_dt_gini = clf_dt_gini.fit(X_balanced, y_balanced)
clf_dt_gini.predict_proba(X_test_balanced)
array([[0.89387755, 0.10612245],
[0.86319153, 0.13680847],
[0.9057356 , 0.0942644 ],
...,
[0.87664042, 0.12335958],
[0.89387755, 0.10612245],
[0.87736173, 0.12263827]])
y_pred_dt_gini = clf_dt.predict(X_test_balanced)
y_pred_dt_gini
array([0, 0, 0, ..., 0, 0, 1], dtype=int64)
# Model Accuracy, how often is the classifier correct?
print('Accuracy Score is {:.5}'.format(accuracy_score(y_test_balanced, y_pred_dt_gini)))
Accuracy Score is 0.8763
# Confusion Matrix Chart
confusion_matrix = metrics.confusion_matrix(y_test_balanced, y_pred_dt_gini)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ['Eligible: 0', 'Not Eligible: 1'])
cm_display.plot(cmap = 'viridis', colorbar = False, xticks_rotation='horizontal')
cm_display.ax_.set_title("CONFUSION MATRIX\n" + "(DecisionTreeClassifier - Gini)\n")
plt.yticks(rotation = 90)
plt.grid(False)
plt.show()
print(classification_report(y_test_balanced, y_pred_dt_gini))
precision recall f1-score support
0 0.91 0.95 0.93 9651
1 0.46 0.30 0.36 1287
accuracy 0.88 10938
macro avg 0.69 0.63 0.65 10938
weighted avg 0.86 0.88 0.86 10938
Accuracy = metrics.accuracy_score(y_test_balanced, y_pred_dt_gini)
Precision = metrics.precision_score(y_test_balanced, y_pred_dt_gini)
Sensitivity_recall = metrics.recall_score(y_test_balanced, y_pred_dt_gini)
Specificity = metrics.recall_score(y_test_balanced, y_pred_dt_gini, pos_label = 0)
F1_score = metrics.f1_score(y_test_balanced, y_pred_dt_gini)
print({"Accuracy": Accuracy, "Precision": Precision, "Sensitivity_recall": Sensitivity_recall,
"Specificity": Specificity, "F1_score": F1_score})
{'Accuracy': 0.8763027975863961, 'Precision': 0.4607142857142857, 'Sensitivity_recall': 0.3006993006993007, 'Specificity': 0.953061858874728, 'F1_score': 0.36389280677009866}
# AUC - ROC
fpr, tpr, _ = metrics.roc_curve(y_test_balanced, y_pred_dt_gini)
auc = metrics.roc_auc_score(y_test_balanced, y_pred_dt_gini)
# ax = plt.axes()
plt.plot(fpr, tpr, label = 'AUC Score = %.4f'%auc)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC CURVE' + "\n(DecisionTreeClassifier - Gini)\n")
plt.legend(loc = 4)
# ax.set_facecolor("grey")
plt.grid(False)
plt.show()
# RMSE Computation
rmse = np.sqrt(MSE(y_test_balanced, y_pred_dt_gini))
print("RMSE : % f" %(rmse))
RMSE : 0.351706
from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import Image
import pydotplus
dot_data = StringIO()
export_graphviz(clf_dt_gini, out_file=dot_data,
filled=True, rounded=True,
special_characters=True,feature_names = X_test_balanced.columns,class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('decision_tree_gini.png')
Image(graph.create_png())
# Decision Tree - Entropy
clf_dt_ent = DecisionTreeClassifier(criterion = "entropy", max_depth = 3, min_samples_leaf = 5, random_state = 42)
clf_dt_ent = clf_dt_ent.fit(X_balanced, y_balanced)
clf_dt_ent.predict_proba(X_test_balanced)
array([[0.90344453, 0.09655547],
[0.86371412, 0.13628588],
[0.90344453, 0.09655547],
...,
[0.87728879, 0.12271121],
[0.86371412, 0.13628588],
[0.87728879, 0.12271121]])
y_pred_dt_ent = clf_dt.predict(X_test_balanced)
y_pred_dt_ent
array([0, 0, 0, ..., 0, 0, 1], dtype=int64)
# Model Accuracy, how often is the classifier correct?
print('Accuracy Score is {:.5}'.format(accuracy_score(y_test_balanced, y_pred_dt_ent)))
Accuracy Score is 0.8763
# Confusion Matrix Chart
confusion_matrix = metrics.confusion_matrix(y_test_balanced, y_pred_dt_ent)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ['Eligible: 0', 'Not Eligible: 1'])
cm_display.plot(cmap = 'viridis', colorbar = False, xticks_rotation='horizontal')
cm_display.ax_.set_title("CONFUSION MATRIX\n" + "(DecisionTreeClassifier - Entropy)\n")
plt.yticks(rotation = 90)
plt.grid(False)
plt.show()
print(classification_report(y_test_balanced, y_pred_dt_ent))
precision recall f1-score support
0 0.91 0.95 0.93 9651
1 0.46 0.30 0.36 1287
accuracy 0.88 10938
macro avg 0.69 0.63 0.65 10938
weighted avg 0.86 0.88 0.86 10938
Accuracy = metrics.accuracy_score(y_test_balanced, y_pred_dt_ent)
Precision = metrics.precision_score(y_test_balanced, y_pred_dt_ent)
Sensitivity_recall = metrics.recall_score(y_test_balanced, y_pred_dt_ent)
Specificity = metrics.recall_score(y_test_balanced, y_pred_dt_ent, pos_label = 0)
F1_score = metrics.f1_score(y_test_balanced, y_pred_dt_ent)
print({"Accuracy": Accuracy, "Precision": Precision, "Sensitivity_recall": Sensitivity_recall,
"Specificity": Specificity, "F1_score": F1_score})
{'Accuracy': 0.8763027975863961, 'Precision': 0.4607142857142857, 'Sensitivity_recall': 0.3006993006993007, 'Specificity': 0.953061858874728, 'F1_score': 0.36389280677009866}
# AUC - ROC
fpr, tpr, _ = metrics.roc_curve(y_test_balanced, y_pred_dt_ent)
auc = metrics.roc_auc_score(y_test_balanced, y_pred_dt_ent)
# ax = plt.axes()
plt.plot(fpr, tpr, label = 'AUC Score = %.4f'%auc)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC CURVE' + "\n(DecisionTreeClassifier - Entropy)\n")
plt.legend(loc = 4)
# ax.set_facecolor("grey")
plt.grid(False)
plt.show()
# RMSE Computation
rmse = np.sqrt(MSE(y_test_balanced, y_pred_dt_ent))
print("RMSE : % f" %(rmse))
RMSE : 0.351706
dot_data = StringIO()
export_graphviz(clf_dt_ent, out_file=dot_data,
filled=True, rounded=True,
special_characters=True,feature_names = X_test_balanced.columns,class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('decision_tree_entropy.png')
Image(graph.create_png())
from sklearn.ensemble import RandomForestClassifier
# create regressor object
# clf_rf = RandomForestClassifier(n_estimators = 100, max_depth = 2, random_state = 42)
clf_rf = RandomForestClassifier(n_estimators = 100, random_state = 42)
# fit the regressor with x and y data
clf_rf.fit(X_balanced, y_balanced)
RandomForestClassifier(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(random_state=42)
y_pred_rf = clf_rf.predict(X_test_balanced)
y_pred_rf
array([0, 0, 0, ..., 0, 0, 1], dtype=int64)
y_pred_rf.shape
(10938,)
# Calculate the accuracy of the model
print(clf_rf.score(X_test_balanced, y_test_balanced))
0.8814225635399524
# RMSE Computation
rmse = np.sqrt(MSE(y_test_balanced, y_pred_rf))
print("RMSE : % f" %(rmse))
RMSE : 0.344351
# Confusion Matrix Chart
from sklearn.metrics import confusion_matrix
confusion_matrix = metrics.confusion_matrix(y_test_balanced, y_pred_rf)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ['Eligible: 0', 'Not Eligible: 1'])
cm_display.plot(cmap = 'viridis', colorbar = False, xticks_rotation='horizontal')
cm_display.ax_.set_title("CONFUSION MATRIX\n" + "(RandomForestClassifier)\n")
plt.yticks(rotation = 90)
plt.grid(False)
plt.show()
print(classification_report(y_test_balanced, y_pred_rf))
precision recall f1-score support
0 0.91 0.96 0.93 9651
1 0.49 0.30 0.38 1287
accuracy 0.88 10938
macro avg 0.70 0.63 0.66 10938
weighted avg 0.86 0.88 0.87 10938
Accuracy = metrics.accuracy_score(y_test_balanced, y_pred_rf)
Precision = metrics.precision_score(y_test_balanced, y_pred_rf)
Sensitivity_recall = metrics.recall_score(y_test_balanced, y_pred_rf)
Specificity = metrics.recall_score(y_test_balanced, y_pred_rf, pos_label = 0)
F1_score = metrics.f1_score(y_test_balanced, y_pred_rf)
print({"Accuracy": Accuracy, "Precision": Precision, "Sensitivity_recall": Sensitivity_recall,
"Specificity": Specificity, "F1_score": F1_score})
{'Accuracy': 0.8814225635399524, 'Precision': 0.4936708860759494, 'Sensitivity_recall': 0.30303030303030304, 'Specificity': 0.9585535177701793, 'F1_score': 0.3755416466056813}
# AUC - ROC
fpr, tpr, _ = metrics.roc_curve(y_test_balanced, y_pred_rf)
auc = metrics.roc_auc_score(y_test_balanced, y_pred_rf)
# ax = plt.axes()
plt.plot(fpr, tpr, label = 'AUC Score = %.4f'%auc)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC CURVE' + "\n(RandomForestClassifier)\n")
plt.legend(loc = 4)
# ax.set_facecolor("grey")
plt.grid(False)
plt.show()
# Find best number of estimators using RandomForestClassifier
for w in range(10, 300, 20):
mod_rf = RandomForestClassifier(n_estimators = w, oob_score = True, n_jobs = 1, random_state = 42)
mod_rf.fit(X_balanced,y_balanced)
oob = mod_rf.oob_score_
print('For n_estimators = '+str(w))
print('oob score is '+str(oob))
print('*****************')
For n_estimators = 10 oob score is 0.8760923233669031 ***************** For n_estimators = 30 oob score is 0.8816568047337278 ***************** For n_estimators = 50 oob score is 0.8824405345037031 ***************** For n_estimators = 70 oob score is 0.8825580939691994 ***************** For n_estimators = 90 oob score is 0.8830675183196833 ***************** For n_estimators = 110 oob score is 0.8828323993886907 ***************** For n_estimators = 130 oob score is 0.8832634507621772 ***************** For n_estimators = 150 oob score is 0.8834985696931698 ***************** For n_estimators = 170 oob score is 0.8836553156471648 ***************** For n_estimators = 190 oob score is 0.8838120616011599 ***************** For n_estimators = 210 oob score is 0.8838120616011599 ***************** For n_estimators = 230 oob score is 0.883968807555155 ***************** For n_estimators = 250 oob score is 0.8840863670206512 ***************** For n_estimators = 270 oob score is 0.8838120616011599 ***************** For n_estimators = 290 oob score is 0.8837728751126611 *****************
for w in range(300, 600, 20):
mod_rf2 = RandomForestClassifier(n_estimators = w, oob_score = True, n_jobs = 1, random_state = 42)
mod_rf2.fit(X_balanced, y_balanced)
oob = mod_rf2.oob_score_
print('For n_estimators = '+str(w))
print('oob score is '+str(oob))
print('*****************')
For n_estimators = 300 oob score is 0.8838120616011599 ***************** For n_estimators = 320 oob score is 0.8839296210666562 ***************** For n_estimators = 340 oob score is 0.8840471805321525 ***************** For n_estimators = 360 oob score is 0.8842039264861475 ***************** For n_estimators = 380 oob score is 0.8843214859516438 ***************** For n_estimators = 400 oob score is 0.8840079940436537 ***************** For n_estimators = 420 oob score is 0.8840863670206512 ***************** For n_estimators = 440 oob score is 0.8842039264861475 ***************** For n_estimators = 460 oob score is 0.8838512480896586 ***************** For n_estimators = 480 oob score is 0.8840863670206512 ***************** For n_estimators = 500 oob score is 0.8840863670206512 ***************** For n_estimators = 520 oob score is 0.8840079940436537 ***************** For n_estimators = 540 oob score is 0.8841647399976488 ***************** For n_estimators = 560 oob score is 0.883968807555155 ***************** For n_estimators = 580 oob score is 0.8837728751126611 *****************
# Taking 380 as the correct number of estimators becasue it has the highest oob score as 0.884321
from sklearn.ensemble import RandomForestClassifier
# create regressor object
clf_rf4 = RandomForestClassifier(n_estimators = 380, random_state = 42)
# fit the regressor with x and y data
clf_rf4.fit(X_balanced, y_balanced)
RandomForestClassifier(n_estimators=380, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(n_estimators=380, random_state=42)
clf_rf4.predict_proba(X_test_balanced)
array([[0.75672067, 0.24327933],
[1. , 0. ],
[0.97535088, 0.02464912],
...,
[0.79329948, 0.20670052],
[0.99793233, 0.00206767],
[0.09219298, 0.90780702]])
Y_pred_rf4 = clf_rf4.predict(X_test_balanced)
Y_pred_rf4
array([0, 0, 0, ..., 0, 0, 1], dtype=int64)
Y_pred_rf4.shape
(10938,)
# RMSE Computation
rmse = np.sqrt(MSE(y_test_balanced, Y_pred_rf4))
print("RMSE : % f" %(rmse))
RMSE : 0.342220
# Calculate the accuracy of the model
print(clf_rf4.score(X_test_balanced, y_test_balanced))
0.8828853538123972
# Confusion Matrix Chart
from sklearn.metrics import confusion_matrix
confusion_matrix = metrics.confusion_matrix(y_test_balanced, Y_pred_rf4)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ['Eligible: 0', 'Not Eligible: 1'])
cm_display.plot(cmap = 'viridis', colorbar = False, xticks_rotation='horizontal')
cm_display.ax_.set_title("CONFUSION MATRIX\n" + "(RandomForestClassifier - 380 estimators)\n")
plt.yticks(rotation = 90)
plt.grid(False)
plt.show()
print(classification_report(y_test_balanced, Y_pred_rf4))
precision recall f1-score support
0 0.91 0.96 0.94 9651
1 0.50 0.29 0.37 1287
accuracy 0.88 10938
macro avg 0.71 0.62 0.65 10938
weighted avg 0.86 0.88 0.87 10938
Accuracy = metrics.accuracy_score(y_test_balanced, Y_pred_rf4)
Precision = metrics.precision_score(y_test_balanced, Y_pred_rf4)
Sensitivity_recall = metrics.recall_score(y_test_balanced, Y_pred_rf4)
Specificity = metrics.recall_score(y_test_balanced, Y_pred_rf4, pos_label = 0)
F1_score = metrics.f1_score(y_test_balanced, Y_pred_rf4)
print({"Accuracy": Accuracy, "Precision": Precision, "Sensitivity_recall": Sensitivity_recall,
"Specificity": Specificity, "F1_score": F1_score})
{'Accuracy': 0.8828853538123972, 'Precision': 0.5040983606557377, 'Sensitivity_recall': 0.2867132867132867, 'Specificity': 0.9623873173764377, 'F1_score': 0.3655274888558693}
# AUC - ROC
fpr, tpr, _ = metrics.roc_curve(y_test_balanced, Y_pred_rf4)
auc = metrics.roc_auc_score(y_test_balanced, Y_pred_rf4)
# ax = plt.axes()
plt.plot(fpr, tpr, label = 'AUC Score = %.4f'%auc)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC CURVE' + "\n(RandomForestClassifier - 380 estimators)\n")
plt.legend(loc = 4)
# ax.set_facecolor("grey")
plt.grid(False)
plt.show()
# Accuracy
clf_rf4.score(X_test_balanced, y_test_balanced)
0.8828853538123972
from sklearn.inspection import permutation_importance
clf_rf.feature_importances_
array([0.02443374, 0.03142517, 0.0213844 , 0.20423693, 0.03928624,
0.03690086, 0.04983252, 0.02667157, 0.0249356 , 0.03164262,
0.01321561, 0.09781582, 0.05098756, 0.20879432, 0.13843703])
plt.rcParams.update({'figure.figsize': (12.0, 8.0)})
plt.barh(X_test_balanced.columns, clf_rf.feature_importances_)
plt.title('Feature Importances obtained from coefficients - RandomForestClassifier\n')
plt.grid(False)
sorted_idx = clf_rf.feature_importances_.argsort()
plt.barh(X_test_balanced.columns[sorted_idx], clf_rf.feature_importances_[sorted_idx])
#plt.xlabel("Random Forest Feature Importance")
plt.title('Feature Importances obtained from coefficients - RandomForestClassifier\n')
plt.grid(False)
# drop the least important feature from X_balanced and X_test_balanced
X_balanced_fi = X_balanced.copy()
X_balanced_fi.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11957 | 1 | 1 | 1 | 135000.00 | 4 | 4 | 1 | 1 | 0 | 1 | 0 | 8 | 7 | 39 | 9.00 |
| 11052 | 1 | 0 | 1 | 135000.00 | 4 | 4 | 0 | 1 | 0 | 0 | 0 | 12 | 6 | 36 | 7.00 |
| 13960 | 1 | 0 | 0 | 135000.00 | 4 | 4 | 1 | 1 | 1 | 0 | 0 | 4 | 7 | 40 | 9.00 |
| 5372 | 0 | 0 | 1 | 103500.00 | 0 | 4 | 1 | 1 | 1 | 1 | 0 | 16 | 7 | 46 | 4.00 |
| 30992 | 1 | 1 | 0 | 270000.00 | 0 | 4 | 1 | 1 | 1 | 1 | 0 | 10 | 7 | 56 | 7.00 |
X_balanced_fi = X_balanced_fi.drop(['flag_email'], axis=1)
X_balanced_fi.shape
(25519, 14)
X_balanced_fi.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | occupation_type | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11957 | 1 | 1 | 1 | 135000.00 | 4 | 4 | 1 | 1 | 0 | 1 | 8 | 7 | 39 | 9.00 |
| 11052 | 1 | 0 | 1 | 135000.00 | 4 | 4 | 0 | 1 | 0 | 0 | 12 | 6 | 36 | 7.00 |
| 13960 | 1 | 0 | 0 | 135000.00 | 4 | 4 | 1 | 1 | 1 | 0 | 4 | 7 | 40 | 9.00 |
| 5372 | 0 | 0 | 1 | 103500.00 | 0 | 4 | 1 | 1 | 1 | 1 | 16 | 7 | 46 | 4.00 |
| 30992 | 1 | 1 | 0 | 270000.00 | 0 | 4 | 1 | 1 | 1 | 1 | 10 | 7 | 56 | 7.00 |
X_test_balanced_fi = X_test_balanced.copy()
X_test_balanced_fi.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 36245 | 0 | 0 | 1 | 675000.00 | 4 | 4 | 1 | 1 | 0 | 0 | 1 | 12 | 7 | 49 | 18.00 |
| 30510 | 0 | 0 | 1 | 270000.00 | 4 | 4 | 1 | 1 | 0 | 0 | 0 | 6 | 7 | 49 | 27.00 |
| 7249 | 0 | 0 | 1 | 112500.00 | 4 | 4 | 1 | 1 | 0 | 0 | 1 | 16 | 7 | 42 | 6.00 |
| 22707 | 1 | 0 | 0 | 189000.00 | 2 | 4 | 1 | 1 | 0 | 0 | 0 | 3 | 7 | 43 | 7.00 |
| 18346 | 0 | 0 | 0 | 157500.00 | 4 | 1 | 2 | 1 | 0 | 0 | 0 | 11 | 3 | 47 | 8.00 |
X_test_balanced_fi = X_test_balanced_fi.drop(['flag_email'], axis=1)
X_test_balanced_fi.shape
(10938, 14)
X_test_balanced_fi.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | occupation_type | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 36245 | 0 | 0 | 1 | 675000.00 | 4 | 4 | 1 | 1 | 0 | 0 | 12 | 7 | 49 | 18.00 |
| 30510 | 0 | 0 | 1 | 270000.00 | 4 | 4 | 1 | 1 | 0 | 0 | 6 | 7 | 49 | 27.00 |
| 7249 | 0 | 0 | 1 | 112500.00 | 4 | 4 | 1 | 1 | 0 | 0 | 16 | 7 | 42 | 6.00 |
| 22707 | 1 | 0 | 0 | 189000.00 | 2 | 4 | 1 | 1 | 0 | 0 | 3 | 7 | 43 | 7.00 |
| 18346 | 0 | 0 | 0 | 157500.00 | 4 | 1 | 2 | 1 | 0 | 0 | 11 | 3 | 47 | 8.00 |
# Now, I will build the random forest model again and check accuracy
clf_rf_fi = RandomForestClassifier(n_estimators = 100, random_state = 42)
# fit the regressor with x and y data
clf_rf_fi.fit(X_balanced_fi, y_balanced)
RandomForestClassifier(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(random_state=42)
clf_rf_fi.predict_proba(X_test_balanced_fi)
array([[0.74740612, 0.25259388],
[1. , 0. ],
[1. , 0. ],
...,
[0.78396442, 0.21603558],
[1. , 0. ],
[0.115 , 0.885 ]])
y_pred_rf_fi = clf_rf_fi.predict(X_test_balanced_fi)
y_pred_rf_fi
array([0, 0, 0, ..., 0, 0, 1], dtype=int64)
# Calculate the accuracy of the model
print(clf_rf_fi.score(X_test_balanced_fi, y_test_balanced))
0.8804168952276468
# RMSE Computation
rmse = np.sqrt(MSE(y_test_balanced, y_pred_rf_fi))
print("RMSE : % f" %(rmse))
RMSE : 0.345808
# Confusion Matrix Chart
from sklearn.metrics import confusion_matrix
confusion_matrix = metrics.confusion_matrix(y_test_balanced, y_pred_rf_fi)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ['Eligible: 0', 'Not Eligible: 1'])
cm_display.plot(cmap = 'viridis', colorbar = False, xticks_rotation='horizontal')
cm_display.ax_.set_title("CONFUSION MATRIX\n" + "(RandomForestClassifier - Feature Importance 1)\n")
plt.yticks(rotation = 90)
plt.grid(False)
plt.show()
# Drop another least significant variable
X_balanced_fi = X_balanced_fi.drop(['flag_own_realty'], axis=1)
X_balanced_fi.shape
(25519, 13)
X_balanced_fi.head()
| code_gender | flag_own_car | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | occupation_type | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11957 | 1 | 1 | 135000.00 | 4 | 4 | 1 | 1 | 0 | 1 | 8 | 7 | 39 | 9.00 |
| 11052 | 1 | 0 | 135000.00 | 4 | 4 | 0 | 1 | 0 | 0 | 12 | 6 | 36 | 7.00 |
| 13960 | 1 | 0 | 135000.00 | 4 | 4 | 1 | 1 | 1 | 0 | 4 | 7 | 40 | 9.00 |
| 5372 | 0 | 0 | 103500.00 | 0 | 4 | 1 | 1 | 1 | 1 | 16 | 7 | 46 | 4.00 |
| 30992 | 1 | 1 | 270000.00 | 0 | 4 | 1 | 1 | 1 | 1 | 10 | 7 | 56 | 7.00 |
X_test_balanced_fi = X_test_balanced_fi.drop(['flag_own_realty'], axis=1)
X_test_balanced_fi.shape
(10938, 13)
X_test_balanced_fi.head()
| code_gender | flag_own_car | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | occupation_type | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 36245 | 0 | 0 | 675000.00 | 4 | 4 | 1 | 1 | 0 | 0 | 12 | 7 | 49 | 18.00 |
| 30510 | 0 | 0 | 270000.00 | 4 | 4 | 1 | 1 | 0 | 0 | 6 | 7 | 49 | 27.00 |
| 7249 | 0 | 0 | 112500.00 | 4 | 4 | 1 | 1 | 0 | 0 | 16 | 7 | 42 | 6.00 |
| 22707 | 1 | 0 | 189000.00 | 2 | 4 | 1 | 1 | 0 | 0 | 3 | 7 | 43 | 7.00 |
| 18346 | 0 | 0 | 157500.00 | 4 | 1 | 2 | 1 | 0 | 0 | 11 | 3 | 47 | 8.00 |
# Now, I will build the random forest model again and check accuracy
clf_rf_fi = RandomForestClassifier(n_estimators = 100, random_state = 42)
# fit the regressor with x and y data
clf_rf_fi.fit(X_balanced_fi, y_balanced)
RandomForestClassifier(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(random_state=42)
clf_rf_fi.predict_proba(X_test_balanced_fi)
array([[0.74740612, 0.25259388],
[1. , 0. ],
[0.96 , 0.04 ],
...,
[0.78396442, 0.21603558],
[0.992 , 0.008 ],
[0.119 , 0.881 ]])
y_pred_rf_fi = clf_rf_fi.predict(X_test_balanced_fi)
y_pred_rf_fi
array([0, 0, 0, ..., 0, 0, 1], dtype=int64)
# Calculate the accuracy of the model
print(clf_rf_fi.score(X_test_balanced_fi, y_test_balanced))
0.8808740171877857
# RMSE Computation
rmse = np.sqrt(MSE(y_test_balanced, y_pred_rf_fi))
print("RMSE : % f" %(rmse))
RMSE : 0.345146
# Confusion Matrix Chart
from sklearn.metrics import confusion_matrix
confusion_matrix = metrics.confusion_matrix(y_test_balanced, y_pred_rf_fi)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ['Eligible: 0', 'Not Eligible: 1'])
cm_display.plot(cmap = 'viridis', colorbar = False, xticks_rotation='horizontal')
cm_display.ax_.set_title("CONFUSION MATRIX\n" + "(RandomForestClassifier - Feature Importance 2)\n")
plt.yticks(rotation = 90)
plt.grid(False)
plt.show()
from sklearn.ensemble import GradientBoostingClassifier
gboost = GradientBoostingClassifier(n_estimators = 5000, max_depth = 3, random_state = 42)
gboost.fit(X_balanced, y_balanced)
GradientBoostingClassifier(n_estimators=5000, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GradientBoostingClassifier(n_estimators=5000, random_state=42)
gboost.predict_proba(X_test_balanced)
array([[0.77102469, 0.22897531],
[0.97609289, 0.02390711],
[0.92565536, 0.07434464],
...,
[0.86420675, 0.13579325],
[0.96229184, 0.03770816],
[0.33288448, 0.66711552]])
preds_gb = gboost.predict(X_test_balanced)
preds_gb
array([0, 0, 0, ..., 0, 0, 1], dtype=int64)
preds_gb.shape
(10938,)
# Calculate the accuracy of the model
print(gboost.score(X_test_balanced, y_test_balanced))
0.8833424757725361
# RMSE Computation
rmse = np.sqrt(MSE(y_test_balanced, preds_gb))
print("RMSE : % f" %(rmse))
RMSE : 0.341552
# Confusion Matrix Chart
from sklearn.metrics import confusion_matrix
confusion_matrix = metrics.confusion_matrix(y_test_balanced, preds_gb)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ['Eligible: 0', 'Not Eligible: 1'])
cm_display.plot(cmap = 'viridis', colorbar = False, xticks_rotation='horizontal')
cm_display.ax_.set_title("CONFUSION MATRIX\n" + "(GradientBoostingClassifier)\n")
plt.yticks(rotation = 90)
plt.grid(False)
plt.show()
print(classification_report(y_test_balanced, preds_gb))
precision recall f1-score support
0 0.89 0.98 0.94 9651
1 0.52 0.13 0.20 1287
accuracy 0.88 10938
macro avg 0.71 0.56 0.57 10938
weighted avg 0.85 0.88 0.85 10938
# AUC - ROC
fpr, tpr, _ = metrics.roc_curve(y_test_balanced, preds_gb)
auc = metrics.roc_auc_score(y_test_balanced, preds_gb)
# ax = plt.axes()
plt.plot(fpr, tpr, label = 'AUC Score = %.4f'%auc)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC CURVE' + "\n(GradientBoostingClassifier)\n")
plt.legend(loc = 4)
# ax.set_facecolor("grey")
plt.grid(False)
plt.show()
predictions = pd.DataFrame(y_pred_rf)
predictions
| 0 | |
|---|---|
| 0 | 0 |
| 1 | 0 |
| 2 | 0 |
| 3 | 0 |
| 4 | 0 |
| ... | ... |
| 10933 | 0 |
| 10934 | 0 |
| 10935 | 0 |
| 10936 | 0 |
| 10937 | 1 |
10938 rows × 1 columns
# Save the Predicted Values to a .csv file
predictions.to_csv('D:\\D - Drive\\IPBA\\BYOP\\Capstone Project\\Final - Credit Card Approval Model\\predictions.csv')
# Add prediction column to the test dataframe X_test
predictions.rename(columns = {0:'predicted_status'}, inplace = True)
predictions.head()
| predicted_status | |
|---|---|
| 0 | 0 |
| 1 | 0 |
| 2 | 0 |
| 3 | 0 |
| 4 | 0 |
tested_df = X_test_balanced.copy()
tested_df.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 36245 | 0 | 0 | 1 | 675000.00 | 4 | 4 | 1 | 1 | 0 | 0 | 1 | 12 | 7 | 49 | 18.00 |
| 30510 | 0 | 0 | 1 | 270000.00 | 4 | 4 | 1 | 1 | 0 | 0 | 0 | 6 | 7 | 49 | 27.00 |
| 7249 | 0 | 0 | 1 | 112500.00 | 4 | 4 | 1 | 1 | 0 | 0 | 1 | 16 | 7 | 42 | 6.00 |
| 22707 | 1 | 0 | 0 | 189000.00 | 2 | 4 | 1 | 1 | 0 | 0 | 0 | 3 | 7 | 43 | 7.00 |
| 18346 | 0 | 0 | 0 | 157500.00 | 4 | 1 | 2 | 1 | 0 | 0 | 0 | 11 | 3 | 47 | 8.00 |
tested_df['predicted_status'] = predictions['predicted_status']
tested_df.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members_bucket | age | employed_years | predicted_status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 36245 | 0 | 0 | 1 | 675000.00 | 4 | 4 | 1 | 1 | 0 | 0 | 1 | 12 | 7 | 49 | 18.00 | NaN |
| 30510 | 0 | 0 | 1 | 270000.00 | 4 | 4 | 1 | 1 | 0 | 0 | 0 | 6 | 7 | 49 | 27.00 | NaN |
| 7249 | 0 | 0 | 1 | 112500.00 | 4 | 4 | 1 | 1 | 0 | 0 | 1 | 16 | 7 | 42 | 6.00 | 0.00 |
| 22707 | 1 | 0 | 0 | 189000.00 | 2 | 4 | 1 | 1 | 0 | 0 | 0 | 3 | 7 | 43 | 7.00 | NaN |
| 18346 | 0 | 0 | 0 | 157500.00 | 4 | 1 | 2 | 1 | 0 | 0 | 0 | 11 | 3 | 47 | 8.00 | NaN |
tested_df.predicted_status.value_counts()
0.00 3049 1.00 212 Name: predicted_status, dtype: int64
# Save the final DataFrame to .csv
final_df = tested_df.copy()
final_df.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members_bucket | age | employed_years | predicted_status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 36245 | 0 | 0 | 1 | 675000.00 | 4 | 4 | 1 | 1 | 0 | 0 | 1 | 12 | 7 | 49 | 18.00 | NaN |
| 30510 | 0 | 0 | 1 | 270000.00 | 4 | 4 | 1 | 1 | 0 | 0 | 0 | 6 | 7 | 49 | 27.00 | NaN |
| 7249 | 0 | 0 | 1 | 112500.00 | 4 | 4 | 1 | 1 | 0 | 0 | 1 | 16 | 7 | 42 | 6.00 | 0.00 |
| 22707 | 1 | 0 | 0 | 189000.00 | 2 | 4 | 1 | 1 | 0 | 0 | 0 | 3 | 7 | 43 | 7.00 | NaN |
| 18346 | 0 | 0 | 0 | 157500.00 | 4 | 1 | 2 | 1 | 0 | 0 | 0 | 11 | 3 | 47 | 8.00 | NaN |
final_df.to_csv('D:\\D - Drive\\IPBA\\BYOP\\Capstone Project\\Final - Credit Card Approval Model\\final_df.csv', index = False)
model_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 36457 entries, 0 to 36456 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 code_gender 36457 non-null int32 1 flag_own_car 36457 non-null int32 2 flag_own_realty 36457 non-null int32 3 amt_income_total 36457 non-null float64 4 name_income_type 36457 non-null int32 5 name_education_type 36457 non-null int32 6 name_family_status 36457 non-null int32 7 name_housing_type 36457 non-null int32 8 flag_work_phone 36457 non-null int32 9 flag_phone 36457 non-null int32 10 flag_email 36457 non-null int32 11 occupation_type 36457 non-null int32 12 status 36457 non-null int64 13 cnt_fam_members_bucket 36457 non-null int32 14 age 36457 non-null int32 15 employed_years 36457 non-null float64 dtypes: float64(2), int32(13), int64(1) memory usage: 2.6 MB
X_balanced.head()
| code_gender | flag_own_car | flag_own_realty | amt_income_total | name_income_type | name_education_type | name_family_status | name_housing_type | flag_work_phone | flag_phone | flag_email | occupation_type | cnt_fam_members_bucket | age | employed_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11957 | 1 | 1 | 1 | 135000.00 | 4 | 4 | 1 | 1 | 0 | 1 | 0 | 8 | 7 | 39 | 9.00 |
| 11052 | 1 | 0 | 1 | 135000.00 | 4 | 4 | 0 | 1 | 0 | 0 | 0 | 12 | 6 | 36 | 7.00 |
| 13960 | 1 | 0 | 0 | 135000.00 | 4 | 4 | 1 | 1 | 1 | 0 | 0 | 4 | 7 | 40 | 9.00 |
| 5372 | 0 | 0 | 1 | 103500.00 | 0 | 4 | 1 | 1 | 1 | 1 | 0 | 16 | 7 | 46 | 4.00 |
| 30992 | 1 | 1 | 0 | 270000.00 | 0 | 4 | 1 | 1 | 1 | 1 | 0 | 10 | 7 | 56 | 7.00 |
X_balanced.shape
(25519, 15)
X_balanced = np.array([['F', 'N', 'Y', 27000, "Working", "Higher education", "Civil marriage", "House / apartment", 'Y', 'N', 'N', "Managers",
'Two', 40, 13]])
X_balanced
array([['F', 'N', 'Y', '27000', 'Working', 'Higher education',
'Civil marriage', 'House / apartment', 'Y', 'N', 'N', 'Managers',
'Two', '40', '13']], dtype='<U17')
X_balanced[:, 0] = le_code_gender.transform(X_balanced[:, 0])
X_balanced[:, 1] = le_flag_own_car.transform(X_balanced[:, 1])
X_balanced[:, 2] = le_flag_own_realty.transform(X_balanced[:, 2])
X_balanced[:, 4] = le_name_income_type.transform(X_balanced[:, 4])
X_balanced[:, 5] = le_name_education_type.transform(X_balanced[:, 5])
X_balanced[:, 6] = le_name_family_status.transform(X_balanced[:, 6])
X_balanced[:, 7] = le_name_housing_type.transform(X_balanced[:, 7])
X_balanced[:, 8] = le_flag_work_phone.transform(X_balanced[:, 8])
X_balanced[:, 9] = le_flag_phone.transform(X_balanced[:, 9])
X_balanced[:, 10] = le_flag_email.transform(X_balanced[:, 10])
X_balanced[:, 11] = le_occupation_type.transform(X_balanced[:, 11])
X_balanced[:, 12] = le_cnt_fam_members_bucket.transform(X_balanced[:, 12])
X_balanced = X_balanced.astype(int)
X_balanced
array([[ 0, 0, 1, 27000, 4, 1, 0, 1, 1,
0, 0, 10, 7, 40, 13]])
y_pred_rf = clf_rf.predict(X_balanced)
print(y_pred_rf)
[1]
if (y_pred_rf[0] == 0):
print('Congratulations! You are ELIGIBLE for the Credit Card!')
else:
print('Sorry! You are NOT ELIGIBLE for the Credit Card!')
Sorry! You are NOT ELIGIBLE for the Credit Card!
import pickle
data = {"model" : clf_rf, "le_code_gender" : le_code_gender, "le_flag_own_car" : le_flag_own_car,
"le_flag_own_realty" : le_flag_own_realty, "le_name_income_type" : le_name_income_type,
"le_name_education_type" : le_name_education_type, "le_name_family_status" : le_name_family_status,
"le_name_housing_type" : le_name_housing_type, "le_flag_work_phone" : le_flag_work_phone,
"le_flag_phone" : le_flag_phone, "le_flag_email" : le_flag_email, "le_occupation_type" : le_occupation_type,
"le_cnt_fam_members_bucket" : le_cnt_fam_members_bucket}
with open('saved_steps.pkl', 'wb') as file:
pickle.dump(data, file)
with open('saved_steps.pkl', 'rb') as file:
data = pickle.load(file)
clf_rf_loaded = data["model"]
le_code_gender = data["le_code_gender"]
le_flag_own_car = data["le_flag_own_car"]
le_flag_own_realty = data["le_flag_own_realty"]
le_name_income_type = data["le_name_income_type"]
le_name_education_type = data["le_name_education_type"]
le_name_family_status = data["le_name_family_status"]
le_name_housing_type = data["le_name_housing_type"]
le_flag_work_phone = data["le_flag_work_phone"]
le_flag_phone = data["le_flag_phone"]
le_flag_email = data["le_flag_email"]
le_occupation_type = data["le_occupation_type"]
le_cnt_fam_members_bucket = data["le_cnt_fam_members_bucket"]
round(clf_rf.predict_proba(X_balanced)[:, 0][0] * 100, 2)
43.56
type(clf_rf.predict_proba(X_balanced)[:, 0][0])
numpy.float64
y_pred_rf = clf_rf_loaded.predict(X_balanced)
y_pred_rf
array([1], dtype=int64)
if (y_pred_rf[0] == 0):
print('Congratulations! You are ELIGIBLE for the Credit Card!')
else:
print('Sorry! You are NOT ELIGIBLE for the Credit Card!')
Sorry! You are NOT ELIGIBLE for the Credit Card!